diff --git a/.circleci/docker/build.sh b/.circleci/docker/build.sh index b7fef829b798..d8772329379c 100755 --- a/.circleci/docker/build.sh +++ b/.circleci/docker/build.sh @@ -379,7 +379,7 @@ docker build \ --build-arg "NINJA_VERSION=${NINJA_VERSION:-}" \ --build-arg "KATEX=${KATEX:-}" \ --build-arg "ROCM_VERSION=${ROCM_VERSION:-}" \ - --build-arg "PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH:-gfx900;gfx906}" \ + --build-arg "PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH:-gfx906}" \ --build-arg "IMAGE_NAME=${IMAGE_NAME}" \ --build-arg "UCX_COMMIT=${UCX_COMMIT}" \ --build-arg "UCC_COMMIT=${UCC_COMMIT}" \ diff --git a/.circleci/docker/common/install_cudnn.sh b/.circleci/docker/common/install_cudnn.sh index 1f1c34ea200d..4a8829b1cba1 100644 --- a/.circleci/docker/common/install_cudnn.sh +++ b/.circleci/docker/common/install_cudnn.sh @@ -4,7 +4,13 @@ if [[ ${CUDNN_VERSION} == 8 ]]; then # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement mkdir tmp_cudnn && cd tmp_cudnn CUDNN_NAME="cudnn-linux-x86_64-8.3.2.44_cuda11.5-archive" - curl -OLs https://developer.download.nvidia.com/compute/redist/cudnn/v8.3.2/local_installers/11.5/${CUDNN_NAME}.tar.xz + if [[ ${CUDA_VERSION:0:4} == "11.7" ]]; then + CUDNN_NAME="cudnn-linux-x86_64-8.5.0.96_cuda11-archive" + curl -OLs https://ossci-linux.s3.amazonaws.com/${CUDNN_NAME}.tar.xz + else + curl -OLs https://developer.download.nvidia.com/compute/redist/cudnn/v8.3.2/local_installers/11.5/${CUDNN_NAME}.tar.xz + fi + tar xf ${CUDNN_NAME}.tar.xz cp -a ${CUDNN_NAME}/include/* /usr/include/ cp -a ${CUDNN_NAME}/include/* /usr/local/cuda/include/ diff --git a/.circleci/docker/common/install_ucc.sh b/.circleci/docker/common/install_ucc.sh index 4d691ebb5e9e..333e44e6f779 100755 --- a/.circleci/docker/common/install_ucc.sh +++ b/.circleci/docker/common/install_ucc.sh @@ -36,7 +36,7 @@ function install_ucc() { git submodule update --init --recursive ./autogen.sh - ./configure --prefix=$UCC_HOME --with-ucx=$UCX_HOME --with-nccl=no --with-cuda=$with_cuda + ./configure --prefix=$UCC_HOME --with-ucx=$UCX_HOME --with-cuda=$with_cuda time make -j sudo make install diff --git a/.circleci/docker/ubuntu-cuda/Dockerfile b/.circleci/docker/ubuntu-cuda/Dockerfile index 4375b612a308..53349bfec533 100644 --- a/.circleci/docker/ubuntu-cuda/Dockerfile +++ b/.circleci/docker/ubuntu-cuda/Dockerfile @@ -118,6 +118,7 @@ COPY --from=pytorch/llvm:9.0.1 /opt/llvm /opt/llvm # Install CUDNN ARG CUDNN_VERSION +ARG CUDA_VERSION COPY ./common/install_cudnn.sh install_cudnn.sh RUN if [ "${CUDNN_VERSION}" -eq 8 ]; then bash install_cudnn.sh; fi RUN rm install_cudnn.sh diff --git a/.circleci/scripts/windows_cudnn_install.sh b/.circleci/scripts/windows_cudnn_install.sh index 763bc950fc4b..c279259e8341 100644 --- a/.circleci/scripts/windows_cudnn_install.sh +++ b/.circleci/scripts/windows_cudnn_install.sh @@ -18,7 +18,7 @@ case ${CUDA_VERSION} in ;; 11.7) # Use cudnn8.3 with hard-coded cuda11.5 version - cudnn_file_name="cudnn-windows-x86_64-8.3.2.44_cuda11.5-archive" + cudnn_file_name="cudnn-windows-x86_64-8.5.0.96_cuda11-archive" ;; *) echo "CUDA_VERSION: ${CUDA_VERSION} not supported yet" diff --git a/.github/ci_commit_pins/torchdynamo.txt b/.github/ci_commit_pins/torchdynamo.txt index 992c70b96b45..945ad626f6e6 100644 --- a/.github/ci_commit_pins/torchdynamo.txt +++ b/.github/ci_commit_pins/torchdynamo.txt @@ -1 +1 @@ -fe3173f7e6c804e6330ac187ea8e4101f45ff9a2 +41c44bc1d080d6cf063419a4166732b983b84eef diff --git a/.github/ci_commit_pins/vision.txt b/.github/ci_commit_pins/vision.txt index 
2a1cd7720c6b..a0500b9f6bc1 100644 --- a/.github/ci_commit_pins/vision.txt +++ b/.github/ci_commit_pins/vision.txt @@ -1 +1 @@ -84dcf695d64c15f8a0be845ac65901bdde845429 +a4f53308b2d0f1aa9191686e326f45c26053f686 diff --git a/.github/ci_commit_pins/xla.txt b/.github/ci_commit_pins/xla.txt index 31bf7123c62e..d536a71eaf88 100644 --- a/.github/ci_commit_pins/xla.txt +++ b/.github/ci_commit_pins/xla.txt @@ -1 +1 @@ -b8688ee3c03120a15978844db6c4fa73eceb6594 +4dec902617aea14ca4013e402eea56e92701cac9 diff --git a/.github/merge_rules.yaml b/.github/merge_rules.yaml index 209f71bde842..7f091a55c62f 100644 --- a/.github/merge_rules.yaml +++ b/.github/merge_rules.yaml @@ -3,6 +3,7 @@ - .jenkins/caffe2/* - aten/src/ATen/core/interned_strings.h - docs/source/onnx.rst + - docs/source/onnx* - docs/source/scripts/onnx/** - scripts/onnx/** - test/jit/test_export_modes.py @@ -15,6 +16,8 @@ - torch/csrc/jit/serialization/onnx.* - torch/csrc/onnx/** - torch/onnx/** + - third_party/onnx + - caffe2/python/onnx/** approved_by: - BowenBao - abock @@ -323,6 +326,7 @@ - '*' approved_by: - pytorch/metamates + - mruberry mandatory_checks_name: - Facebook CLA Check - Lint diff --git a/.github/scale-config.yml b/.github/scale-config.yml deleted file mode 100644 index 1cf99b326ba8..000000000000 --- a/.github/scale-config.yml +++ /dev/null @@ -1,69 +0,0 @@ -# scale-config.yml: -# Powers what instance types are available for GHA auto-scaled -# runners. Runners listed here will be available as self hosted -# runners, configuration is directly pulled from the main branch. -# -# NOTE (Apr, 5, 2021): Linux runners are currently all an amazonlinux2 -# -# NOTE (Jan 5, 2021): Linux runners are all non-ephemeral to reduce the amount of CreateInstaces calls -# to avoid RequestLimitExceeded issues -# -# TODO: Add some documentation on how the auto-scaling works -# -# NOTE: Default values, -# -# runner_types: -# runner_label: -# instance_type: m4.large -# os: linux -# max_available: 20 -# disk_size: 50 -# is_ephemeral: true - -runner_types: - # mainly used for ciflow-should-run, not made to run any serious tests - linux.large: - instance_type: c5.large - os: linux - disk_size: 10 - is_ephemeral: false - linux.2xlarge: - instance_type: c5.2xlarge - os: linux - max_available: 1000 - disk_size: 150 - is_ephemeral: false - linux.4xlarge: # for binary-builds - instance_type: c5.4xlarge - os: linux - max_available: 500 - disk_size: 150 - is_ephemeral: false - linux.8xlarge.nvidia.gpu: - instance_type: g3.8xlarge - os: linux - max_available: 200 - disk_size: 150 - is_ephemeral: false - linux.4xlarge.nvidia.gpu: - instance_type: g3.4xlarge - os: linux - max_available: 250 - disk_size: 150 - is_ephemeral: false - linux.16xlarge.nvidia.gpu: - instance_type: g3.16xlarge - os: linux - max_available: 10 - disk_size: 150 - is_ephemeral: false - windows.4xlarge: - instance_type: c5d.4xlarge - os: windows - max_available: 200 - disk_size: 256 - windows.8xlarge.nvidia.gpu: - instance_type: p3.2xlarge - os: windows - max_available: 100 - disk_size: 256 diff --git a/.github/scripts/generate_binary_build_matrix.py b/.github/scripts/generate_binary_build_matrix.py index b1e3b46bda34..0f2693a9aa54 100644 --- a/.github/scripts/generate_binary_build_matrix.py +++ b/.github/scripts/generate_binary_build_matrix.py @@ -13,7 +13,7 @@ from typing import Dict, List, Tuple, Optional -CUDA_ARCHES = ["10.2", "11.3", "11.6", "11.7"] +CUDA_ARCHES = ["10.2", "11.6", "11.7"] ROCM_ARCHES = ["5.1.1", "5.2"] diff --git a/.github/scripts/generate_ci_workflows.py 
b/.github/scripts/generate_ci_workflows.py index 653cfeebaab7..3722f4307029 100755 --- a/.github/scripts/generate_ci_workflows.py +++ b/.github/scripts/generate_ci_workflows.py @@ -207,15 +207,6 @@ class OperatingSystem: ), ] WINDOWS_BINARY_SMOKE_WORKFLOWS = [ - BinaryBuildWorkflow( - os=OperatingSystem.WINDOWS, - package_type="wheel", - build_configs=generate_binary_build_matrix.generate_wheels_matrix( - OperatingSystem.WINDOWS, - arches=["11.3"], - python_versions=["3.7"]), - branches="master", - ), BinaryBuildWorkflow( os=OperatingSystem.WINDOWS, package_type="libtorch", diff --git a/.github/scripts/run_torchbench.py b/.github/scripts/run_torchbench.py index 51bd1e33f5df..352da69c8158 100644 --- a/.github/scripts/run_torchbench.py +++ b/.github/scripts/run_torchbench.py @@ -13,10 +13,12 @@ # 1. Does not reuse the build artifact in other CI workflows # 2. CI jobs are serialized because there is only one worker import os +import boto3  # type: ignore[import] import git  # type: ignore[import] import pathlib import argparse import subprocess +from pathlib import Path from typing import List, Tuple @@ -31,6 +33,25 @@ direction: decrease timeout: 720 tests:""" +S3_BUCKET = "ossci-metrics" +S3_PREFIX = "torchbench-pr-test" +S3_URL_BASE = f"https://{S3_BUCKET}.s3.amazonaws.com/" + +class S3Client: +    def __init__(self, bucket: str = S3_BUCKET, prefix: str = S3_PREFIX): +        self.s3 = boto3.client('s3') +        self.resource = boto3.resource('s3') +        self.bucket = bucket +        self.prefix = prefix + +    def upload_file(self, file_path: Path, filekey_prefix: str) -> None: +        assert file_path.is_file(), f"Specified file path {file_path} does not exist or is not a file." +        file_name = file_path.name +        s3_key = f"{self.prefix}/{filekey_prefix}/{file_name}" +        print(f"Uploading file {file_name} to S3 with key: {s3_key}") +        self.s3.upload_file(str(file_path), self.bucket, s3_key) +        # output the result URL +        print(f"Uploaded the result file {file_name} to {S3_URL_BASE}{s3_key}") def gen_abtest_config(control: str, treatment: str, models: List[str]) -> str: d = {} @@ -137,9 +158,21 @@ def run_userbenchmarks(pytorch_path: str, torchbench_path: str, base_sha: str, h print(f"Running torchbench userbenchmark command: {command}") subprocess.check_call(command, cwd=torchbench_path, env=env) +def process_upload_s3(result_dir: str) -> None: +    # validate result directory +    result_dir_path = Path(result_dir) +    assert result_dir_path.exists(), f"Specified result directory {result_dir} doesn't exist."
+    # upload all files to S3 bucket ossci-metrics +    files = [x for x in result_dir_path.iterdir() if x.is_file()] +    # upload each file to the S3 bucket +    s3_client: S3Client = S3Client() +    filekey_prefix = result_dir_path.name +    for f in files: +        s3_client.upload_file(f, filekey_prefix) + if __name__ == "__main__": parser = argparse.ArgumentParser(description='Run TorchBench tests based on PR') -    parser.add_argument('--pr-body', required=True, help="The file that contains body of a Pull Request") +    parser.add_argument('--pr-body', help="The file that contains body of a Pull Request") subparsers = parser.add_subparsers(dest='command') # parser for setup the torchbench branch name env @@ -151,6 +184,9 @@ def run_userbenchmarks(pytorch_path: str, torchbench_path: str, base_sha: str, h run_parser.add_argument('--pr-head-sha', required=True, type=str, help="The Pull Request head hash") run_parser.add_argument('--pytorch-path', required=True, type=str, help="Path to pytorch repository") run_parser.add_argument('--torchbench-path', required=True, type=str, help="Path to TorchBench repository") +    # parser to upload results to S3 +    upload_parser = subparsers.add_parser("upload-s3") +    upload_parser.add_argument('--result-dir', required=True, type=str, help="Path to benchmark output") args = parser.parse_args() if args.command == 'set-torchbench-branch': @@ -181,6 +217,8 @@ def run_userbenchmarks(pytorch_path: str, torchbench_path: str, base_sha: str, h if not models and not userbenchmarks: print("Can't parse valid models or userbenchmarks from the pr body. Quit.") exit(-1) +    elif args.command == 'upload-s3': +        process_upload_s3(args.result_dir) else: print(f"The command {args.command} is not supported.") exit(-1) diff --git a/.github/scripts/trymerge.py b/.github/scripts/trymerge.py index 610c70cdc0d9..6c28b69ec692 100755 --- a/.github/scripts/trymerge.py +++ b/.github/scripts/trymerge.py @@ -912,6 +912,8 @@ def merge_into(self, repo: GitRepo, *, repo.push(self.default_branch(), dry_run) if not dry_run: +            if land_check_commit: +                self.delete_land_time_check_branch(repo) gh_add_labels(self.org, self.project, self.pr_num, ["merged"]) def merge_changes(self, @@ -962,6 +964,11 @@ def create_land_time_check_branch(self, repo.checkout(orig_branch) return commit +    def delete_land_time_check_branch(self, +                                      repo: GitRepo) -> None: +        land_check_branch = f'landchecks/{self.pr_num}' +        repo._run_git('push', 'origin', '-d', land_check_branch) + class MandatoryChecksMissingError(Exception): pass @@ -1344,7 +1351,7 @@ def merge(pr_num: int, repo: GitRepo, # here to stop the merge process right away find_matching_merge_rule(pr, repo, skip_mandatory_checks=True) -    if land_checks: +    if land_checks and not dry_run: land_check_commit = pr.create_land_time_check_branch( repo, 'viable/strict', @@ -1354,6 +1361,8 @@ def merge(pr_num: int, repo: GitRepo, gh_post_pr_comment(org, project, pr.pr_num, explainer.get_merge_message(land_check_commit)) if (datetime.utcnow() - pr.last_pushed_at()).days > stale_pr_days: +        if land_checks and not dry_run: +            pr.delete_land_time_check_branch(repo) raise RuntimeError("This PR is too stale; the last push date was more than 3 days ago.
Please rebase and try again.") start_time = time.time() @@ -1366,6 +1375,8 @@ def merge(pr_num: int, repo: GitRepo, print(f"Attempting merge of https://github.com/{org}/{project}/pull/{pr_num} ({elapsed_time / 60} minutes elapsed)") pr = GitHubPR(org, project, pr_num) if initial_commit_sha != pr.last_commit()['oid']: +            if land_checks and not dry_run: +                pr.delete_land_time_check_branch(repo) raise RuntimeError("New commits were pushed while merging. Please rerun the merge command.") try: find_matching_merge_rule(pr, repo) @@ -1400,10 +1411,16 @@ def merge(pr_num: int, repo: GitRepo, last_exception = str(ex) print(f"Merge of https://github.com/{org}/{project}/pull/{pr_num} failed due to: {ex}. Retrying in 5 min") time.sleep(5 * 60) +        except RuntimeError: +            if land_checks and not dry_run: +                pr.delete_land_time_check_branch(repo) +            raise # Finally report timeout back msg = f"Merged timed out after {timeout_minutes} minutes. Please contact the pytorch_dev_infra team." msg += f"The last exception was: {last_exception}" if not dry_run: +        if land_checks: +            pr.delete_land_time_check_branch(repo) gh_add_labels(org, project, pr_num, ["land-failed"]) raise RuntimeError(msg) diff --git a/.github/workflows/_linux-test.yml b/.github/workflows/_linux-test.yml index 4305ed04e0d2..95ed840025a7 100644 --- a/.github/workflows/_linux-test.yml +++ b/.github/workflows/_linux-test.yml @@ -117,6 +117,7 @@ jobs: NUM_TEST_SHARDS: ${{ matrix.num_shards }} PR_BODY: ${{ github.event.pull_request.body }} SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 +          SCCACHE_S3_KEY_PREFIX: ${{ github.workflow }} SHM_SIZE: ${{ contains(inputs.build-environment, 'cuda') && '2g' || '1g' }} DOCKER_IMAGE: ${{ inputs.docker-image }} XLA_CUDA: ${{ contains(inputs.build-environment, 'xla') && '0' || '' }} @@ -171,6 +172,7 @@ jobs: -e PR_LABELS \ -e MAX_JOBS="$(nproc --ignore=2)" \ -e SCCACHE_BUCKET \ +            -e SCCACHE_S3_KEY_PREFIX \ -e XLA_CUDA \ -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ diff --git a/.github/workflows/_mac-build.yml b/.github/workflows/_mac-build.yml index 20dae569717e..bc3f10de40f9 100644 --- a/.github/workflows/_mac-build.yml +++ b/.github/workflows/_mac-build.yml @@ -33,6 +33,21 @@ on: default: "3.8" description: | The python version to be used. Will be 3.8 by default +      test-matrix: +        required: false +        type: string +        description: | +          An optional JSON description of what test configs to run later on. This +          is moved here from the Linux test workflow so that we can apply filter +          logic using test-config labels earlier and skip unnecessary builds + +    outputs: +      test-matrix: +        value: ${{ inputs.test-matrix }} +        description: An optional JSON description of what test configs to run later on. +      build-outcome: +        value: ${{ jobs.build.outputs.build-outcome }} +        description: The outcome of the build step. This is used to influence test filtering logic later on.
secrets: MACOS_SCCACHE_S3_ACCESS_KEY_ID: @@ -52,6 +67,8 @@ jobs: AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} BUILD_ENVIRONMENT: ${{ inputs.build-environment }} + outputs: + build-outcome: ${{ steps.build.outcome }} steps: # [see note: pytorch repo ref] - name: Checkout PyTorch @@ -90,7 +107,17 @@ jobs: with: github-token: ${{ secrets.GITHUB_TOKEN }} + # Apply the filter logic to the build step too if the test-config label is already there + - name: Select all requested test configurations (if the test matrix is available) + id: filter + uses: ./.github/actions/filter-test-configs + with: + github-token: ${{ secrets.GITHUB_TOKEN }} + test-matrix: ${{ inputs.test-matrix }} + - name: Build + if: steps.filter.outputs.is-test-matrix-empty == 'False' || inputs.test-matrix == '' + id: build env: OUR_GITHUB_JOB_ID: ${{ steps.get-job-id.outputs.job-id }} run: | @@ -98,13 +125,13 @@ jobs: ${CONDA_RUN} .jenkins/pytorch/macos-build.sh - name: Archive artifacts into zip - if: inputs.build-generates-artifacts + if: inputs.build-generates-artifacts && steps.build.outcome != 'skipped' run: | zip -1 -r artifacts.zip dist/ build/.ninja_log build/compile_commands.json .pytorch-test-times.json - name: Store PyTorch Build Artifacts on GHA uses: actions/upload-artifact@v2 - if: inputs.build-generates-artifacts + if: inputs.build-generates-artifacts && steps.build.outcome != 'skipped' with: name: ${{ env.BUILD_ENVIRONMENT }} retention-days: 14 @@ -114,7 +141,7 @@ jobs: - name: Upload sccache stats to GHA uses: actions/upload-artifact@v2 # Only if sccache is installed, see above - if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} + if: ${{ (github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository) && steps.build.outcome != 'skipped' }} with: name: sccache-stats-${{ inputs.build-environment }}-runattempt${{ github.run_attempt }}-${{ steps.get-job-id.outputs.job-id }} retention-days: 14 diff --git a/.github/workflows/_mac-test.yml b/.github/workflows/_mac-test.yml index bb711ccefb6d..4b3d25717307 100644 --- a/.github/workflows/_mac-test.yml +++ b/.github/workflows/_mac-test.yml @@ -33,16 +33,38 @@ on: description: secret acess key for test stats upload jobs: + # This needs to be run right before the test starts so that it can gather the + # latest labels from the PR + filter: + runs-on: [self-hosted, linux.large] + outputs: + test-matrix: ${{ steps.filter.outputs.test-matrix }} + is-test-matrix-empty: ${{ steps.filter.outputs.is-test-matrix-empty }} + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + with: + fetch-depth: 1 + submodules: false + + - name: Select all requested test configurations + id: filter + uses: ./.github/actions/filter-test-configs + with: + github-token: ${{ secrets.GITHUB_TOKEN }} + test-matrix: ${{ inputs.test-matrix }} + test: - # Don't run on forked repos. 
-    if: github.repository_owner == 'pytorch' +    needs: filter +    # Don't run on forked repos or empty test matrix +    if: github.repository_owner == 'pytorch' && needs.filter.outputs.is-test-matrix-empty == 'False' # For setup-miniconda, see https://github.com/conda-incubator/setup-miniconda/issues/179 # Also ensure that we always run with the right architecture defaults: run: shell: arch -arch ${{ inputs.arch }} bash -e -l {0} strategy: -      matrix: ${{ fromJSON(inputs.test-matrix) }} +      matrix: ${{ fromJSON(needs.filter.outputs.test-matrix) }} fail-fast: false runs-on: ${{ matrix.runner }} timeout-minutes: 240 diff --git a/.github/workflows/_win-build.yml b/.github/workflows/_win-build.yml index fb2195fafce6..d4704129a141 100644 --- a/.github/workflows/_win-build.yml +++ b/.github/workflows/_win-build.yml @@ -23,6 +23,18 @@ on: description: | If this is set, our linter will use this to make sure that every other job with the same `sync-tag` is identical. +      test-matrix: +        required: false +        type: string +        description: | +          An optional JSON description of what test configs to run later on. This +          is moved here from the Linux test workflow so that we can apply filter +          logic using test-config labels earlier and skip unnecessary builds + +    outputs: +      test-matrix: +        value: ${{ inputs.test-matrix }} +        description: An optional JSON description of what test configs to run later on. env: GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }} @@ -61,7 +73,17 @@ jobs: with: github-token: ${{ secrets.GITHUB_TOKEN }} +      # Apply the filter logic to the build step too if the test-config label is already there +      - name: Select all requested test configurations (if the test matrix is available) +        id: filter +        uses: ./.github/actions/filter-test-configs +        with: +          github-token: ${{ secrets.GITHUB_TOKEN }} +          test-matrix: ${{ inputs.test-matrix }} + - name: Build +        if: steps.filter.outputs.is-test-matrix-empty == 'False' || inputs.test-matrix == '' +        id: build shell: bash env: PYTORCH_FINAL_PACKAGE_DIR: /c/${{ github.run_id }}/build-results/ @@ -89,6 +111,7 @@ jobs: # Upload to github so that people can click and download artifacts - name: Upload artifacts to s3 +        if: steps.build.outcome != 'skipped' uses: seemethere/upload-artifact-s3@v5 with: retention-days: 14 @@ -97,6 +120,7 @@ jobs: path: C:\${{ github.run_id }}\build-results - name: Upload sccache stats +        if: steps.build.outcome != 'skipped' uses: seemethere/upload-artifact-s3@v5 with: s3-prefix: | diff --git a/.github/workflows/_win-test.yml b/.github/workflows/_win-test.yml index 243bd7563639..6d5ae369b709 100644 --- a/.github/workflows/_win-test.yml +++ b/.github/workflows/_win-test.yml @@ -27,11 +27,33 @@ env: GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }} jobs: +  # This needs to be run right before the test starts so that it can gather the +  # latest labels from the PR +  filter: +    runs-on: [self-hosted, linux.large] +    outputs: +      test-matrix: ${{ steps.filter.outputs.test-matrix }} +      is-test-matrix-empty: ${{ steps.filter.outputs.is-test-matrix-empty }} +    steps: +      - name: Checkout PyTorch +        uses: pytorch/pytorch/.github/actions/checkout-pytorch@master +        with: +          fetch-depth: 1 +          submodules: false + +      - name: Select all requested test configurations +        id: filter +        uses: ./.github/actions/filter-test-configs +        with: +          github-token: ${{ secrets.GITHUB_TOKEN }} +          test-matrix: ${{ inputs.test-matrix }} + test: -    # Don't run on forked repos.
- if: github.repository_owner == 'pytorch' + needs: filter + # Don't run on forked repos or empty test matrix + if: github.repository_owner == 'pytorch' && needs.filter.outputs.is-test-matrix-empty == 'False' strategy: - matrix: ${{ fromJSON(inputs.test-matrix) }} + matrix: ${{ fromJSON(needs.filter.outputs.test-matrix) }} fail-fast: false runs-on: ${{ matrix.runner }} timeout-minutes: 300 diff --git a/.github/workflows/generated-linux-binary-conda-nightly.yml b/.github/workflows/generated-linux-binary-conda-nightly.yml index 81f779f2f014..0f3148bc28c1 100644 --- a/.github/workflows/generated-linux-binary-conda-nightly.yml +++ b/.github/workflows/generated-linux-binary-conda-nightly.yml @@ -153,66 +153,6 @@ jobs: aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - conda-py3_7-cuda11_3-build: - if: ${{ github.repository_owner == 'pytorch' }} - uses: ./.github/workflows/_binary-build-linux.yml - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/conda-builder:cuda11.3 - DESIRED_PYTHON: "3.7" - build_name: conda-py3_7-cuda11_3 - build_environment: linux-binary-conda - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - - conda-py3_7-cuda11_3-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_7-cuda11_3-build - uses: ./.github/workflows/_binary-test-linux.yml - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/conda-builder:cuda11.3 - DESIRED_PYTHON: "3.7" - build_name: conda-py3_7-cuda11_3 - build_environment: linux-binary-conda - runs_on: linux.4xlarge.nvidia.gpu - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - conda-py3_7-cuda11_3-upload: # Uploading - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_7-cuda11_3-test - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/conda-builder:cuda11.3 - DESIRED_PYTHON: "3.7" - build_name: conda-py3_7-cuda11_3 - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} - conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - uses: ./.github/workflows/_binary-upload.yml conda-py3_7-cuda11_6-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -450,66 +390,6 @@ jobs: aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - conda-py3_8-cuda11_3-build: - if: ${{ github.repository_owner == 'pytorch' }} - uses: ./.github/workflows/_binary-build-linux.yml - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - 
PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/conda-builder:cuda11.3 - DESIRED_PYTHON: "3.8" - build_name: conda-py3_8-cuda11_3 - build_environment: linux-binary-conda - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - - conda-py3_8-cuda11_3-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_8-cuda11_3-build - uses: ./.github/workflows/_binary-test-linux.yml - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/conda-builder:cuda11.3 - DESIRED_PYTHON: "3.8" - build_name: conda-py3_8-cuda11_3 - build_environment: linux-binary-conda - runs_on: linux.4xlarge.nvidia.gpu - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - conda-py3_8-cuda11_3-upload: # Uploading - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_8-cuda11_3-test - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/conda-builder:cuda11.3 - DESIRED_PYTHON: "3.8" - build_name: conda-py3_8-cuda11_3 - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} - conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - uses: ./.github/workflows/_binary-upload.yml conda-py3_8-cuda11_6-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -747,66 +627,6 @@ jobs: aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - conda-py3_9-cuda11_3-build: - if: ${{ github.repository_owner == 'pytorch' }} - uses: ./.github/workflows/_binary-build-linux.yml - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/conda-builder:cuda11.3 - DESIRED_PYTHON: "3.9" - build_name: conda-py3_9-cuda11_3 - build_environment: linux-binary-conda - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - - conda-py3_9-cuda11_3-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_9-cuda11_3-build - uses: ./.github/workflows/_binary-test-linux.yml - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/conda-builder:cuda11.3 - DESIRED_PYTHON: "3.9" - build_name: conda-py3_9-cuda11_3 - build_environment: linux-binary-conda - runs_on: linux.4xlarge.nvidia.gpu - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - conda-py3_9-cuda11_3-upload: # Uploading - if: 
${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_9-cuda11_3-test - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/conda-builder:cuda11.3 - DESIRED_PYTHON: "3.9" - build_name: conda-py3_9-cuda11_3 - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} - conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - uses: ./.github/workflows/_binary-upload.yml conda-py3_9-cuda11_6-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -1044,66 +864,6 @@ jobs: aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - conda-py3_10-cuda11_3-build: - if: ${{ github.repository_owner == 'pytorch' }} - uses: ./.github/workflows/_binary-build-linux.yml - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/conda-builder:cuda11.3 - DESIRED_PYTHON: "3.10" - build_name: conda-py3_10-cuda11_3 - build_environment: linux-binary-conda - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - - conda-py3_10-cuda11_3-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_10-cuda11_3-build - uses: ./.github/workflows/_binary-test-linux.yml - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/conda-builder:cuda11.3 - DESIRED_PYTHON: "3.10" - build_name: conda-py3_10-cuda11_3 - build_environment: linux-binary-conda - runs_on: linux.4xlarge.nvidia.gpu - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - conda-py3_10-cuda11_3-upload: # Uploading - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_10-cuda11_3-test - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/conda-builder:cuda11.3 - DESIRED_PYTHON: "3.10" - build_name: conda-py3_10-cuda11_3 - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} - conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - uses: ./.github/workflows/_binary-upload.yml conda-py3_10-cuda11_6-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml diff --git a/.github/workflows/generated-linux-binary-libtorch-cxx11-abi-nightly.yml b/.github/workflows/generated-linux-binary-libtorch-cxx11-abi-nightly.yml index cf629ed3358a..53033239bf44 100644 --- 
a/.github/workflows/generated-linux-binary-libtorch-cxx11-abi-nightly.yml +++ b/.github/workflows/generated-linux-binary-libtorch-cxx11-abi-nightly.yml @@ -528,258 +528,6 @@ jobs: aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - libtorch-cuda11_3-shared-with-deps-cxx11-abi-build: - if: ${{ github.repository_owner == 'pytorch' }} - uses: ./.github/workflows/_binary-build-linux.yml - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.3 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - build_name: libtorch-cuda11_3-shared-with-deps-cxx11-abi - build_environment: linux-binary-libtorch-cxx11-abi - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - - libtorch-cuda11_3-shared-with-deps-cxx11-abi-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_3-shared-with-deps-cxx11-abi-build - uses: ./.github/workflows/_binary-test-linux.yml - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.3 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - build_name: libtorch-cuda11_3-shared-with-deps-cxx11-abi - build_environment: linux-binary-libtorch-cxx11-abi - runs_on: linux.4xlarge.nvidia.gpu - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - libtorch-cuda11_3-shared-with-deps-cxx11-abi-upload: # Uploading - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_3-shared-with-deps-cxx11-abi-test - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.3 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - build_name: libtorch-cuda11_3-shared-with-deps-cxx11-abi - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} - conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - uses: ./.github/workflows/_binary-upload.yml - libtorch-cuda11_3-shared-without-deps-cxx11-abi-build: - if: ${{ github.repository_owner == 'pytorch' }} - uses: ./.github/workflows/_binary-build-linux.yml - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.3 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - build_name: libtorch-cuda11_3-shared-without-deps-cxx11-abi - build_environment: linux-binary-libtorch-cxx11-abi - secrets: - github-token: ${{ 
secrets.GITHUB_TOKEN }} - - libtorch-cuda11_3-shared-without-deps-cxx11-abi-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_3-shared-without-deps-cxx11-abi-build - uses: ./.github/workflows/_binary-test-linux.yml - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.3 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - build_name: libtorch-cuda11_3-shared-without-deps-cxx11-abi - build_environment: linux-binary-libtorch-cxx11-abi - runs_on: linux.4xlarge.nvidia.gpu - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - libtorch-cuda11_3-shared-without-deps-cxx11-abi-upload: # Uploading - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_3-shared-without-deps-cxx11-abi-test - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.3 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - build_name: libtorch-cuda11_3-shared-without-deps-cxx11-abi - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} - conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - uses: ./.github/workflows/_binary-upload.yml - libtorch-cuda11_3-static-with-deps-cxx11-abi-build: - if: ${{ github.repository_owner == 'pytorch' }} - uses: ./.github/workflows/_binary-build-linux.yml - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.3 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - build_name: libtorch-cuda11_3-static-with-deps-cxx11-abi - build_environment: linux-binary-libtorch-cxx11-abi - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - - libtorch-cuda11_3-static-with-deps-cxx11-abi-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_3-static-with-deps-cxx11-abi-build - uses: ./.github/workflows/_binary-test-linux.yml - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.3 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - build_name: libtorch-cuda11_3-static-with-deps-cxx11-abi - build_environment: linux-binary-libtorch-cxx11-abi - runs_on: linux.4xlarge.nvidia.gpu - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - libtorch-cuda11_3-static-with-deps-cxx11-abi-upload: # Uploading - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_3-static-with-deps-cxx11-abi-test - with: - PYTORCH_ROOT: /pytorch - 
BUILDER_ROOT: /builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.3 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - build_name: libtorch-cuda11_3-static-with-deps-cxx11-abi - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} - conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - uses: ./.github/workflows/_binary-upload.yml - libtorch-cuda11_3-static-without-deps-cxx11-abi-build: - if: ${{ github.repository_owner == 'pytorch' }} - uses: ./.github/workflows/_binary-build-linux.yml - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.3 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - build_name: libtorch-cuda11_3-static-without-deps-cxx11-abi - build_environment: linux-binary-libtorch-cxx11-abi - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - - libtorch-cuda11_3-static-without-deps-cxx11-abi-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_3-static-without-deps-cxx11-abi-build - uses: ./.github/workflows/_binary-test-linux.yml - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.3 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - build_name: libtorch-cuda11_3-static-without-deps-cxx11-abi - build_environment: linux-binary-libtorch-cxx11-abi - runs_on: linux.4xlarge.nvidia.gpu - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - libtorch-cuda11_3-static-without-deps-cxx11-abi-upload: # Uploading - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_3-static-without-deps-cxx11-abi-test - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.3 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - build_name: libtorch-cuda11_3-static-without-deps-cxx11-abi - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} - conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - uses: ./.github/workflows/_binary-upload.yml libtorch-cuda11_6-shared-with-deps-cxx11-abi-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml diff --git a/.github/workflows/generated-linux-binary-libtorch-pre-cxx11-nightly.yml b/.github/workflows/generated-linux-binary-libtorch-pre-cxx11-nightly.yml 
index 0bf3534290c2..23e39d3345ba 100644 --- a/.github/workflows/generated-linux-binary-libtorch-pre-cxx11-nightly.yml +++ b/.github/workflows/generated-linux-binary-libtorch-pre-cxx11-nightly.yml @@ -528,258 +528,6 @@ jobs: aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - libtorch-cuda11_3-shared-with-deps-pre-cxx11-build: - if: ${{ github.repository_owner == 'pytorch' }} - uses: ./.github/workflows/_binary-build-linux.yml - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - build_name: libtorch-cuda11_3-shared-with-deps-pre-cxx11 - build_environment: linux-binary-libtorch-pre-cxx11 - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - - libtorch-cuda11_3-shared-with-deps-pre-cxx11-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_3-shared-with-deps-pre-cxx11-build - uses: ./.github/workflows/_binary-test-linux.yml - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - build_name: libtorch-cuda11_3-shared-with-deps-pre-cxx11 - build_environment: linux-binary-libtorch-pre-cxx11 - runs_on: linux.4xlarge.nvidia.gpu - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - libtorch-cuda11_3-shared-with-deps-pre-cxx11-upload: # Uploading - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_3-shared-with-deps-pre-cxx11-test - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - build_name: libtorch-cuda11_3-shared-with-deps-pre-cxx11 - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} - conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - uses: ./.github/workflows/_binary-upload.yml - libtorch-cuda11_3-shared-without-deps-pre-cxx11-build: - if: ${{ github.repository_owner == 'pytorch' }} - uses: ./.github/workflows/_binary-build-linux.yml - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - build_name: libtorch-cuda11_3-shared-without-deps-pre-cxx11 - build_environment: linux-binary-libtorch-pre-cxx11 - secrets: - 
github-token: ${{ secrets.GITHUB_TOKEN }} - - libtorch-cuda11_3-shared-without-deps-pre-cxx11-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_3-shared-without-deps-pre-cxx11-build - uses: ./.github/workflows/_binary-test-linux.yml - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - build_name: libtorch-cuda11_3-shared-without-deps-pre-cxx11 - build_environment: linux-binary-libtorch-pre-cxx11 - runs_on: linux.4xlarge.nvidia.gpu - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - libtorch-cuda11_3-shared-without-deps-pre-cxx11-upload: # Uploading - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_3-shared-without-deps-pre-cxx11-test - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - build_name: libtorch-cuda11_3-shared-without-deps-pre-cxx11 - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} - conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - uses: ./.github/workflows/_binary-upload.yml - libtorch-cuda11_3-static-with-deps-pre-cxx11-build: - if: ${{ github.repository_owner == 'pytorch' }} - uses: ./.github/workflows/_binary-build-linux.yml - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - build_name: libtorch-cuda11_3-static-with-deps-pre-cxx11 - build_environment: linux-binary-libtorch-pre-cxx11 - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - - libtorch-cuda11_3-static-with-deps-pre-cxx11-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_3-static-with-deps-pre-cxx11-build - uses: ./.github/workflows/_binary-test-linux.yml - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - build_name: libtorch-cuda11_3-static-with-deps-pre-cxx11 - build_environment: linux-binary-libtorch-pre-cxx11 - runs_on: linux.4xlarge.nvidia.gpu - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - libtorch-cuda11_3-static-with-deps-pre-cxx11-upload: # Uploading - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_3-static-with-deps-pre-cxx11-test - with: - PYTORCH_ROOT: /pytorch - 
BUILDER_ROOT: /builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - build_name: libtorch-cuda11_3-static-with-deps-pre-cxx11 - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} - conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - uses: ./.github/workflows/_binary-upload.yml - libtorch-cuda11_3-static-without-deps-pre-cxx11-build: - if: ${{ github.repository_owner == 'pytorch' }} - uses: ./.github/workflows/_binary-build-linux.yml - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - build_name: libtorch-cuda11_3-static-without-deps-pre-cxx11 - build_environment: linux-binary-libtorch-pre-cxx11 - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - - libtorch-cuda11_3-static-without-deps-pre-cxx11-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_3-static-without-deps-pre-cxx11-build - uses: ./.github/workflows/_binary-test-linux.yml - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - build_name: libtorch-cuda11_3-static-without-deps-pre-cxx11 - build_environment: linux-binary-libtorch-pre-cxx11 - runs_on: linux.4xlarge.nvidia.gpu - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - libtorch-cuda11_3-static-without-deps-pre-cxx11-upload: # Uploading - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_3-static-without-deps-pre-cxx11-test - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - build_name: libtorch-cuda11_3-static-without-deps-pre-cxx11 - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} - conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - uses: ./.github/workflows/_binary-upload.yml libtorch-cuda11_6-shared-with-deps-pre-cxx11-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml diff --git a/.github/workflows/generated-linux-binary-manywheel-nightly.yml b/.github/workflows/generated-linux-binary-manywheel-nightly.yml index ba912b65fd14..3c94498cc3ad 
100644 --- a/.github/workflows/generated-linux-binary-manywheel-nightly.yml +++ b/.github/workflows/generated-linux-binary-manywheel-nightly.yml @@ -153,66 +153,6 @@ jobs: aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_7-cuda11_3-build: - if: ${{ github.repository_owner == 'pytorch' }} - uses: ./.github/workflows/_binary-build-linux.yml - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 - DESIRED_PYTHON: "3.7" - build_name: manywheel-py3_7-cuda11_3 - build_environment: linux-binary-manywheel - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - - manywheel-py3_7-cuda11_3-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_7-cuda11_3-build - uses: ./.github/workflows/_binary-test-linux.yml - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 - DESIRED_PYTHON: "3.7" - build_name: manywheel-py3_7-cuda11_3 - build_environment: linux-binary-manywheel - runs_on: linux.4xlarge.nvidia.gpu - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_7-cuda11_3-upload: # Uploading - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_7-cuda11_3-test - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 - DESIRED_PYTHON: "3.7" - build_name: manywheel-py3_7-cuda11_3 - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} - conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - uses: ./.github/workflows/_binary-upload.yml manywheel-py3_7-cuda11_6-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -730,66 +670,6 @@ jobs: aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_8-cuda11_3-build: - if: ${{ github.repository_owner == 'pytorch' }} - uses: ./.github/workflows/_binary-build-linux.yml - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 - DESIRED_PYTHON: "3.8" - build_name: manywheel-py3_8-cuda11_3 - build_environment: linux-binary-manywheel - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - - manywheel-py3_8-cuda11_3-test: # Testing - if: ${{ 
github.repository_owner == 'pytorch' }} - needs: manywheel-py3_8-cuda11_3-build - uses: ./.github/workflows/_binary-test-linux.yml - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 - DESIRED_PYTHON: "3.8" - build_name: manywheel-py3_8-cuda11_3 - build_environment: linux-binary-manywheel - runs_on: linux.4xlarge.nvidia.gpu - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_8-cuda11_3-upload: # Uploading - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_8-cuda11_3-test - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 - DESIRED_PYTHON: "3.8" - build_name: manywheel-py3_8-cuda11_3 - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} - conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - uses: ./.github/workflows/_binary-upload.yml manywheel-py3_8-cuda11_6-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -1307,66 +1187,6 @@ jobs: aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_9-cuda11_3-build: - if: ${{ github.repository_owner == 'pytorch' }} - uses: ./.github/workflows/_binary-build-linux.yml - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 - DESIRED_PYTHON: "3.9" - build_name: manywheel-py3_9-cuda11_3 - build_environment: linux-binary-manywheel - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - - manywheel-py3_9-cuda11_3-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_9-cuda11_3-build - uses: ./.github/workflows/_binary-test-linux.yml - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 - DESIRED_PYTHON: "3.9" - build_name: manywheel-py3_9-cuda11_3 - build_environment: linux-binary-manywheel - runs_on: linux.4xlarge.nvidia.gpu - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_9-cuda11_3-upload: # Uploading - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_9-cuda11_3-test - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - 
DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 - DESIRED_PYTHON: "3.9" - build_name: manywheel-py3_9-cuda11_3 - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} - conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - uses: ./.github/workflows/_binary-upload.yml manywheel-py3_9-cuda11_6-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -1884,66 +1704,6 @@ jobs: aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_10-cuda11_3-build: - if: ${{ github.repository_owner == 'pytorch' }} - uses: ./.github/workflows/_binary-build-linux.yml - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 - DESIRED_PYTHON: "3.10" - build_name: manywheel-py3_10-cuda11_3 - build_environment: linux-binary-manywheel - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - - manywheel-py3_10-cuda11_3-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_10-cuda11_3-build - uses: ./.github/workflows/_binary-test-linux.yml - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 - DESIRED_PYTHON: "3.10" - build_name: manywheel-py3_10-cuda11_3 - build_environment: linux-binary-manywheel - runs_on: linux.4xlarge.nvidia.gpu - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_10-cuda11_3-upload: # Uploading - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_10-cuda11_3-test - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 - DESIRED_PYTHON: "3.10" - build_name: manywheel-py3_10-cuda11_3 - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} - conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - uses: ./.github/workflows/_binary-upload.yml manywheel-py3_10-cuda11_6-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -2461,66 +2221,6 @@ jobs: aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_11-cuda11_3-build: - if: ${{ github.repository_owner == 'pytorch' }} - uses: ./.github/workflows/_binary-build-linux.yml - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: manywheel - # TODO: 
This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 - DESIRED_PYTHON: "3.11" - build_name: manywheel-py3_11-cuda11_3 - build_environment: linux-binary-manywheel - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - - manywheel-py3_11-cuda11_3-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_11-cuda11_3-build - uses: ./.github/workflows/_binary-test-linux.yml - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 - DESIRED_PYTHON: "3.11" - build_name: manywheel-py3_11-cuda11_3 - build_environment: linux-binary-manywheel - runs_on: linux.4xlarge.nvidia.gpu - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_11-cuda11_3-upload: # Uploading - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_11-cuda11_3-test - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 - DESIRED_PYTHON: "3.11" - build_name: manywheel-py3_11-cuda11_3 - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} - conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - uses: ./.github/workflows/_binary-upload.yml manywheel-py3_11-cuda11_6-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml diff --git a/.github/workflows/generated-windows-binary-conda-nightly.yml b/.github/workflows/generated-windows-binary-conda-nightly.yml index 9b78d6139700..df7cc13d8a26 100644 --- a/.github/workflows/generated-windows-binary-conda-nightly.yml +++ b/.github/workflows/generated-windows-binary-conda-nightly.yml @@ -256,7 +256,7 @@ jobs: aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - conda-py3_7-cuda11_3-build: + conda-py3_7-cuda11_6-build: if: ${{ github.repository_owner == 'pytorch' }} runs-on: windows.4xlarge timeout-minutes: 240 @@ -266,8 +266,8 @@ jobs: PACKAGE_TYPE: conda # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.7" @@ -343,7 +343,7 @@ jobs: - uses: actions/upload-artifact@v3 if: always() with: - name: conda-py3_7-cuda11_3 + name: conda-py3_7-cuda11_6 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -360,9 +360,9 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - conda-py3_7-cuda11_3-test: # Testing + conda-py3_7-cuda11_6-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_7-cuda11_3-build + needs: 
conda-py3_7-cuda11_6-build runs-on: windows.8xlarge.nvidia.gpu timeout-minutes: 240 env: @@ -371,8 +371,8 @@ jobs: PACKAGE_TYPE: conda # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.7" @@ -417,7 +417,7 @@ jobs: - uses: actions/download-artifact@v3 name: Download Build Artifacts with: - name: conda-py3_7-cuda11_3 + name: conda-py3_7-cuda11_6 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 @@ -463,27 +463,27 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - conda-py3_7-cuda11_3-upload: # Uploading + conda-py3_7-cuda11_6-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_7-cuda11_3-test + needs: conda-py3_7-cuda11_6-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch BUILDER_ROOT: ${{ github.workspace }}/builder PACKAGE_TYPE: conda # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.7" - build_name: conda-py3_7-cuda11_3 + build_name: conda-py3_7-cuda11_6 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - conda-py3_7-cuda11_6-build: + conda-py3_7-cuda11_7-build: if: ${{ github.repository_owner == 'pytorch' }} runs-on: windows.4xlarge timeout-minutes: 240 @@ -493,8 +493,8 @@ jobs: PACKAGE_TYPE: conda # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu116 - GPU_ARCH_VERSION: 11.6 + DESIRED_CUDA: cu117 + GPU_ARCH_VERSION: 11.7 GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.7" @@ -570,7 +570,7 @@ jobs: - uses: actions/upload-artifact@v3 if: always() with: - name: conda-py3_7-cuda11_6 + name: conda-py3_7-cuda11_7 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -587,9 +587,9 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - conda-py3_7-cuda11_6-test: # Testing + conda-py3_7-cuda11_7-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_7-cuda11_6-build + needs: conda-py3_7-cuda11_7-build runs-on: windows.8xlarge.nvidia.gpu timeout-minutes: 240 env: @@ -598,8 +598,8 @@ jobs: PACKAGE_TYPE: conda # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu116 - GPU_ARCH_VERSION: 11.6 + DESIRED_CUDA: cu117 + GPU_ARCH_VERSION: 11.7 GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.7" @@ -644,7 +644,7 @@ jobs: - uses: actions/download-artifact@v3 name: Download Build Artifacts with: - name: conda-py3_7-cuda11_6 + name: conda-py3_7-cuda11_7 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 @@ -690,27 +690,27 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - conda-py3_7-cuda11_6-upload: # Uploading + conda-py3_7-cuda11_7-upload: # Uploading if: ${{ 
github.repository_owner == 'pytorch' }} - needs: conda-py3_7-cuda11_6-test + needs: conda-py3_7-cuda11_7-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch BUILDER_ROOT: ${{ github.workspace }}/builder PACKAGE_TYPE: conda # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu116 - GPU_ARCH_VERSION: 11.6 + DESIRED_CUDA: cu117 + GPU_ARCH_VERSION: 11.7 GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.7" - build_name: conda-py3_7-cuda11_6 + build_name: conda-py3_7-cuda11_7 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - conda-py3_7-cuda11_7-build: + conda-py3_8-cpu-build: if: ${{ github.repository_owner == 'pytorch' }} runs-on: windows.4xlarge timeout-minutes: 240 @@ -720,11 +720,10 @@ jobs: PACKAGE_TYPE: conda # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu117 - GPU_ARCH_VERSION: 11.7 - GPU_ARCH_TYPE: cuda + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.7" + DESIRED_PYTHON: "3.8" steps: - name: Display EC2 information shell: bash @@ -797,7 +796,7 @@ jobs: - uses: actions/upload-artifact@v3 if: always() with: - name: conda-py3_7-cuda11_7 + name: conda-py3_8-cpu retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -814,10 +813,10 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - conda-py3_7-cuda11_7-test: # Testing + conda-py3_8-cpu-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_7-cuda11_7-build - runs-on: windows.8xlarge.nvidia.gpu + needs: conda-py3_8-cpu-build + runs-on: windows.4xlarge timeout-minutes: 240 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch @@ -825,11 +824,10 @@ jobs: PACKAGE_TYPE: conda # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu117 - GPU_ARCH_VERSION: 11.7 - GPU_ARCH_TYPE: cuda + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.7" + DESIRED_PYTHON: "3.8" steps: - name: Display EC2 information shell: bash @@ -871,7 +869,7 @@ jobs: - uses: actions/download-artifact@v3 name: Download Build Artifacts with: - name: conda-py3_7-cuda11_7 + name: conda-py3_8-cpu path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 @@ -917,27 +915,26 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - conda-py3_7-cuda11_7-upload: # Uploading + conda-py3_8-cpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_7-cuda11_7-test + needs: conda-py3_8-cpu-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch BUILDER_ROOT: ${{ github.workspace }}/builder PACKAGE_TYPE: conda # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu117 - GPU_ARCH_VERSION: 11.7 - GPU_ARCH_TYPE: cuda - DESIRED_PYTHON: "3.7" - build_name: conda-py3_7-cuda11_7 + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DESIRED_PYTHON: "3.8" + build_name: conda-py3_8-cpu secrets: github-token: ${{ secrets.GITHUB_TOKEN }} aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} 
aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - conda-py3_8-cpu-build: + conda-py3_8-cuda11_6-build: if: ${{ github.repository_owner == 'pytorch' }} runs-on: windows.4xlarge timeout-minutes: 240 @@ -947,8 +944,9 @@ jobs: PACKAGE_TYPE: conda # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.8" steps: @@ -1023,7 +1021,7 @@ jobs: - uses: actions/upload-artifact@v3 if: always() with: - name: conda-py3_8-cpu + name: conda-py3_8-cuda11_6 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -1040,10 +1038,10 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - conda-py3_8-cpu-test: # Testing + conda-py3_8-cuda11_6-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_8-cpu-build - runs-on: windows.4xlarge + needs: conda-py3_8-cuda11_6-build + runs-on: windows.8xlarge.nvidia.gpu timeout-minutes: 240 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch @@ -1051,8 +1049,9 @@ jobs: PACKAGE_TYPE: conda # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.8" steps: @@ -1096,7 +1095,7 @@ jobs: - uses: actions/download-artifact@v3 name: Download Build Artifacts with: - name: conda-py3_8-cpu + name: conda-py3_8-cuda11_6 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 @@ -1142,26 +1141,27 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - conda-py3_8-cpu-upload: # Uploading + conda-py3_8-cuda11_6-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_8-cpu-test + needs: conda-py3_8-cuda11_6-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch BUILDER_ROOT: ${{ github.workspace }}/builder PACKAGE_TYPE: conda # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.8" - build_name: conda-py3_8-cpu + build_name: conda-py3_8-cuda11_6 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - conda-py3_8-cuda11_3-build: + conda-py3_8-cuda11_7-build: if: ${{ github.repository_owner == 'pytorch' }} runs-on: windows.4xlarge timeout-minutes: 240 @@ -1171,8 +1171,8 @@ jobs: PACKAGE_TYPE: conda # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 + DESIRED_CUDA: cu117 + GPU_ARCH_VERSION: 11.7 GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.8" @@ -1248,7 +1248,7 @@ jobs: - uses: actions/upload-artifact@v3 if: always() with: - name: conda-py3_8-cuda11_3 + name: conda-py3_8-cuda11_7 retention-days: 14 
if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -1265,9 +1265,9 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - conda-py3_8-cuda11_3-test: # Testing + conda-py3_8-cuda11_7-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_8-cuda11_3-build + needs: conda-py3_8-cuda11_7-build runs-on: windows.8xlarge.nvidia.gpu timeout-minutes: 240 env: @@ -1276,8 +1276,8 @@ jobs: PACKAGE_TYPE: conda # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 + DESIRED_CUDA: cu117 + GPU_ARCH_VERSION: 11.7 GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.8" @@ -1322,7 +1322,7 @@ jobs: - uses: actions/download-artifact@v3 name: Download Build Artifacts with: - name: conda-py3_8-cuda11_3 + name: conda-py3_8-cuda11_7 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 @@ -1368,27 +1368,27 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - conda-py3_8-cuda11_3-upload: # Uploading + conda-py3_8-cuda11_7-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_8-cuda11_3-test + needs: conda-py3_8-cuda11_7-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch BUILDER_ROOT: ${{ github.workspace }}/builder PACKAGE_TYPE: conda # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 + DESIRED_CUDA: cu117 + GPU_ARCH_VERSION: 11.7 GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.8" - build_name: conda-py3_8-cuda11_3 + build_name: conda-py3_8-cuda11_7 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - conda-py3_8-cuda11_6-build: + conda-py3_9-cpu-build: if: ${{ github.repository_owner == 'pytorch' }} runs-on: windows.4xlarge timeout-minutes: 240 @@ -1398,11 +1398,10 @@ jobs: PACKAGE_TYPE: conda # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu116 - GPU_ARCH_VERSION: 11.6 - GPU_ARCH_TYPE: cuda + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.8" + DESIRED_PYTHON: "3.9" steps: - name: Display EC2 information shell: bash @@ -1475,7 +1474,7 @@ jobs: - uses: actions/upload-artifact@v3 if: always() with: - name: conda-py3_8-cuda11_6 + name: conda-py3_9-cpu retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -1492,10 +1491,10 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - conda-py3_8-cuda11_6-test: # Testing + conda-py3_9-cpu-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_8-cuda11_6-build - runs-on: windows.8xlarge.nvidia.gpu + needs: conda-py3_9-cpu-build + runs-on: windows.4xlarge timeout-minutes: 240 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch @@ -1503,11 +1502,10 @@ jobs: PACKAGE_TYPE: conda # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu116 - GPU_ARCH_VERSION: 11.6 - GPU_ARCH_TYPE: cuda + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.8" + 
DESIRED_PYTHON: "3.9" steps: - name: Display EC2 information shell: bash @@ -1549,7 +1547,7 @@ jobs: - uses: actions/download-artifact@v3 name: Download Build Artifacts with: - name: conda-py3_8-cuda11_6 + name: conda-py3_9-cpu path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 @@ -1595,27 +1593,26 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - conda-py3_8-cuda11_6-upload: # Uploading + conda-py3_9-cpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_8-cuda11_6-test + needs: conda-py3_9-cpu-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch BUILDER_ROOT: ${{ github.workspace }}/builder PACKAGE_TYPE: conda # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu116 - GPU_ARCH_VERSION: 11.6 - GPU_ARCH_TYPE: cuda - DESIRED_PYTHON: "3.8" - build_name: conda-py3_8-cuda11_6 + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DESIRED_PYTHON: "3.9" + build_name: conda-py3_9-cpu secrets: github-token: ${{ secrets.GITHUB_TOKEN }} aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - conda-py3_8-cuda11_7-build: + conda-py3_9-cuda11_6-build: if: ${{ github.repository_owner == 'pytorch' }} runs-on: windows.4xlarge timeout-minutes: 240 @@ -1625,11 +1622,11 @@ jobs: PACKAGE_TYPE: conda # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu117 - GPU_ARCH_VERSION: 11.7 + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.8" + DESIRED_PYTHON: "3.9" steps: - name: Display EC2 information shell: bash @@ -1702,7 +1699,7 @@ jobs: - uses: actions/upload-artifact@v3 if: always() with: - name: conda-py3_8-cuda11_7 + name: conda-py3_9-cuda11_6 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -1719,9 +1716,9 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - conda-py3_8-cuda11_7-test: # Testing + conda-py3_9-cuda11_6-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_8-cuda11_7-build + needs: conda-py3_9-cuda11_6-build runs-on: windows.8xlarge.nvidia.gpu timeout-minutes: 240 env: @@ -1730,11 +1727,11 @@ jobs: PACKAGE_TYPE: conda # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu117 - GPU_ARCH_VERSION: 11.7 + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.8" + DESIRED_PYTHON: "3.9" steps: - name: Display EC2 information shell: bash @@ -1776,7 +1773,7 @@ jobs: - uses: actions/download-artifact@v3 name: Download Build Artifacts with: - name: conda-py3_8-cuda11_7 + name: conda-py3_9-cuda11_6 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 @@ -1822,27 +1819,27 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - conda-py3_8-cuda11_7-upload: # Uploading + conda-py3_9-cuda11_6-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_8-cuda11_7-test + needs: conda-py3_9-cuda11_6-test with: PYTORCH_ROOT: ${{ 
github.workspace }}/pytorch BUILDER_ROOT: ${{ github.workspace }}/builder PACKAGE_TYPE: conda # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu117 - GPU_ARCH_VERSION: 11.7 + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 GPU_ARCH_TYPE: cuda - DESIRED_PYTHON: "3.8" - build_name: conda-py3_8-cuda11_7 + DESIRED_PYTHON: "3.9" + build_name: conda-py3_9-cuda11_6 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - conda-py3_9-cpu-build: + conda-py3_9-cuda11_7-build: if: ${{ github.repository_owner == 'pytorch' }} runs-on: windows.4xlarge timeout-minutes: 240 @@ -1852,8 +1849,9 @@ jobs: PACKAGE_TYPE: conda # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu + DESIRED_CUDA: cu117 + GPU_ARCH_VERSION: 11.7 + GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.9" steps: @@ -1928,7 +1926,7 @@ jobs: - uses: actions/upload-artifact@v3 if: always() with: - name: conda-py3_9-cpu + name: conda-py3_9-cuda11_7 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -1945,10 +1943,10 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - conda-py3_9-cpu-test: # Testing + conda-py3_9-cuda11_7-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_9-cpu-build - runs-on: windows.4xlarge + needs: conda-py3_9-cuda11_7-build + runs-on: windows.8xlarge.nvidia.gpu timeout-minutes: 240 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch @@ -1956,8 +1954,9 @@ jobs: PACKAGE_TYPE: conda # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu + DESIRED_CUDA: cu117 + GPU_ARCH_VERSION: 11.7 + GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.9" steps: @@ -2001,7 +2000,7 @@ jobs: - uses: actions/download-artifact@v3 name: Download Build Artifacts with: - name: conda-py3_9-cpu + name: conda-py3_9-cuda11_7 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 @@ -2047,26 +2046,27 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - conda-py3_9-cpu-upload: # Uploading + conda-py3_9-cuda11_7-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_9-cpu-test + needs: conda-py3_9-cuda11_7-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch BUILDER_ROOT: ${{ github.workspace }}/builder PACKAGE_TYPE: conda # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu + DESIRED_CUDA: cu117 + GPU_ARCH_VERSION: 11.7 + GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.9" - build_name: conda-py3_9-cpu + build_name: conda-py3_9-cuda11_7 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - conda-py3_9-cuda11_3-build: + conda-py3_10-cpu-build: if: ${{ 
github.repository_owner == 'pytorch' }} runs-on: windows.4xlarge timeout-minutes: 240 @@ -2076,11 +2076,10 @@ jobs: PACKAGE_TYPE: conda # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" steps: - name: Display EC2 information shell: bash @@ -2153,7 +2152,7 @@ jobs: - uses: actions/upload-artifact@v3 if: always() with: - name: conda-py3_9-cuda11_3 + name: conda-py3_10-cpu retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -2170,10 +2169,10 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - conda-py3_9-cuda11_3-test: # Testing + conda-py3_10-cpu-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_9-cuda11_3-build - runs-on: windows.8xlarge.nvidia.gpu + needs: conda-py3_10-cpu-build + runs-on: windows.4xlarge timeout-minutes: 240 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch @@ -2181,11 +2180,10 @@ jobs: PACKAGE_TYPE: conda # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" steps: - name: Display EC2 information shell: bash @@ -2227,7 +2225,7 @@ jobs: - uses: actions/download-artifact@v3 name: Download Build Artifacts with: - name: conda-py3_9-cuda11_3 + name: conda-py3_10-cpu path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 @@ -2273,688 +2271,9 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - conda-py3_9-cuda11_3-upload: # Uploading + conda-py3_10-cpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_9-cuda11_3-test - with: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DESIRED_PYTHON: "3.9" - build_name: conda-py3_9-cuda11_3 - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} - conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - uses: ./.github/workflows/_binary-upload.yml - conda-py3_9-cuda11_6-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu116 - GPU_ARCH_VERSION: 11.6 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL 
"http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - echo "system info $(uname -a)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 - - name: Enable long paths on Windows - shell: powershell - run: | - Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 - # Since it's just a defensive command, the workflow should continue even the command fails - - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. - shell: powershell - run: | - Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - ref: main - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: actions/upload-artifact@v3 - if: always() - with: - name: conda-py3_9-cuda11_6 - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - conda-py3_9-cuda11_6-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_9-cuda11_6-build - runs-on: windows.8xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu116 - GPU_ARCH_VERSION: 11.6 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" - steps: - 
- name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - echo "system info $(uname -a)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 - - name: Enable long paths on Windows - shell: powershell - run: | - Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 - # Since it's just a defensive command, the workflow should continue even the command fails - - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. - shell: powershell - run: | - Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: actions/download-artifact@v3 - name: Download Build Artifacts - with: - name: conda-py3_9-cuda11_6 - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - ref: main - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - conda-py3_9-cuda11_6-upload: # Uploading - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_9-cuda11_6-test - with: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that 
we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu116 - GPU_ARCH_VERSION: 11.6 - GPU_ARCH_TYPE: cuda - DESIRED_PYTHON: "3.9" - build_name: conda-py3_9-cuda11_6 - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} - conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - uses: ./.github/workflows/_binary-upload.yml - conda-py3_9-cuda11_7-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu117 - GPU_ARCH_VERSION: 11.7 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - echo "system info $(uname -a)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 - - name: Enable long paths on Windows - shell: powershell - run: | - Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 - # Since it's just a defensive command, the workflow should continue even the command fails - - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. - shell: powershell - run: | - Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - ref: main - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: actions/upload-artifact@v3 - if: always() - with: - name: conda-py3_9-cuda11_7 - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - conda-py3_9-cuda11_7-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_9-cuda11_7-build - runs-on: windows.8xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu117 - GPU_ARCH_VERSION: 11.7 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - echo "system info $(uname -a)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 - - name: Enable long paths on Windows - shell: powershell - run: | - Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 - # Since it's just a defensive command, the workflow should continue even the command fails - - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. 
- shell: powershell - run: | - Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: actions/download-artifact@v3 - name: Download Build Artifacts - with: - name: conda-py3_9-cuda11_7 - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - ref: main - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - conda-py3_9-cuda11_7-upload: # Uploading - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_9-cuda11_7-test - with: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu117 - GPU_ARCH_VERSION: 11.7 - GPU_ARCH_TYPE: cuda - DESIRED_PYTHON: "3.9" - build_name: conda-py3_9-cuda11_7 - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} - conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - uses: ./.github/workflows/_binary-upload.yml - conda-py3_10-cpu-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see 
https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - echo "system info $(uname -a)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 - - name: Enable long paths on Windows - shell: powershell - run: | - Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 - # Since it's just a defensive command, the workflow should continue even the command fails - - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. - shell: powershell - run: | - Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - ref: main - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: actions/upload-artifact@v3 - if: always() - with: - name: conda-py3_10-cpu - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - conda-py3_10-cpu-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_10-cpu-build - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - 
GPU_ARCH_TYPE: cpu - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - echo "system info $(uname -a)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 - - name: Enable long paths on Windows - shell: powershell - run: | - Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 - # Since it's just a defensive command, the workflow should continue even the command fails - - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. - shell: powershell - run: | - Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: actions/download-artifact@v3 - name: Download Build Artifacts - with: - name: conda-py3_10-cpu - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - ref: main - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - conda-py3_10-cpu-upload: # Uploading - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_10-cpu-test + needs: conda-py3_10-cpu-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch BUILDER_ROOT: ${{ 
github.workspace }}/builder @@ -2971,233 +2290,6 @@ jobs: aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - conda-py3_10-cuda11_3-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - echo "system info $(uname -a)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 - - name: Enable long paths on Windows - shell: powershell - run: | - Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 - # Since it's just a defensive command, the workflow should continue even the command fails - - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. - shell: powershell - run: | - Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
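The NOTE above is the key to how every one of these jobs is configured: GitHub Actions exposes a per-job file at $GITHUB_ENV, and each KEY=value line appended to it becomes an environment variable for all subsequent steps of the same job. Doing this in a step, rather than at workflow level, is what makes ${RUNNER_TEMP} available. A minimal sketch of the mechanism, with illustrative step names; note that in the jobs above the WIN_PACKAGE_WORK_DIR echo carries no >> "${GITHUB_ENV}" redirection, so as written it only prints to the build log, and it would need the same redirection to persist.

      - name: Populate binary env    # sketch of the pattern used above
        shell: bash
        run: |
          # Appending KEY=value to $GITHUB_ENV exports KEY to later steps.
          echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
          echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
      - name: Consume binary env     # illustrative follow-up step
        shell: bash
        run: |
          # Visible here because the previous step wrote it to $GITHUB_ENV.
          echo "artifacts will be collected from ${PYTORCH_FINAL_PACKAGE_DIR}"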
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - ref: main - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: actions/upload-artifact@v3 - if: always() - with: - name: conda-py3_10-cuda11_3 - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - conda-py3_10-cuda11_3-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_10-cuda11_3-build - runs-on: windows.8xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - echo "system info $(uname -a)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 - - name: Enable long paths on Windows - shell: powershell - run: | - Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 - # Since it's just a defensive command, the workflow should continue even the command fails - - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. 
- shell: powershell - run: | - Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: actions/download-artifact@v3 - name: Download Build Artifacts - with: - name: conda-py3_10-cuda11_3 - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - ref: main - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - conda-py3_10-cuda11_3-upload: # Uploading - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_10-cuda11_3-test - with: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DESIRED_PYTHON: "3.10" - build_name: conda-py3_10-cuda11_3 - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} - conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - uses: ./.github/workflows/_binary-upload.yml conda-py3_10-cuda11_6-build: if: ${{ github.repository_owner == 'pytorch' }} runs-on: windows.4xlarge diff --git a/.github/workflows/generated-windows-binary-libtorch-debug-nightly.yml b/.github/workflows/generated-windows-binary-libtorch-debug-nightly.yml index 5eb61291b684..c0b5ddae71fa 100644 --- a/.github/workflows/generated-windows-binary-libtorch-debug-nightly.yml +++ b/.github/workflows/generated-windows-binary-libtorch-debug-nightly.yml @@ -976,962 +976,6 @@ jobs: aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} 
uses: ./.github/workflows/_binary-upload.yml - libtorch-cuda11_3-shared-with-deps-debug-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_CONFIG: debug - LIBTORCH_VARIANT: shared-with-deps - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - echo "system info $(uname -a)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 - - name: Enable long paths on Windows - shell: powershell - run: | - Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 - # Since it's just a defensive command, the workflow should continue even the command fails - - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. - shell: powershell - run: | - Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
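The "Display EC2 information" step that each of these jobs starts with works by querying the EC2 instance metadata service, a link-local HTTP endpoint (169.254.169.254) reachable only from inside an EC2 instance; each metadata category is a path segment under /latest/meta-data/. Shown self-contained below, as it appears in the jobs above; curl -f turns an HTTP error into a command failure, so set -euo pipefail aborts the step if it is ever run off EC2.

      - name: Display EC2 information
        shell: bash
        run: |
          set -euo pipefail
          function get_ec2_metadata() {
            # category is a path under the metadata endpoint,
            # e.g. ami-id, instance-id, instance-type
            category=$1
            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
          }
          echo "ami-id: $(get_ec2_metadata ami-id)"
          echo "instance-id: $(get_ec2_metadata instance-id)"
          echo "instance-type: $(get_ec2_metadata instance-type)"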
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - ref: main - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: actions/upload-artifact@v3 - if: always() - with: - name: libtorch-cuda11_3-shared-with-deps-debug - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda11_3-shared-with-deps-debug-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_3-shared-with-deps-debug-build - runs-on: windows.8xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_CONFIG: debug - LIBTORCH_VARIANT: shared-with-deps - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - echo "system info $(uname -a)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 - - name: Enable long paths on Windows - shell: powershell - run: | - Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" 
-Value 1 - # Since it's just a defensive command, the workflow should continue even the command fails - - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. - shell: powershell - run: | - Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: actions/download-artifact@v3 - name: Download Build Artifacts - with: - name: libtorch-cuda11_3-shared-with-deps-debug - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - ref: main - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda11_3-shared-with-deps-debug-upload: # Uploading - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_3-shared-with-deps-debug-test - with: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - LIBTORCH_CONFIG: debug - LIBTORCH_VARIANT: shared-with-deps - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - build_name: libtorch-cuda11_3-shared-with-deps-debug - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} - conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - uses: ./.github/workflows/_binary-upload.yml - libtorch-cuda11_3-shared-without-deps-debug-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: windows.4xlarge - 
timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_CONFIG: debug - LIBTORCH_VARIANT: shared-without-deps - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - echo "system info $(uname -a)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 - - name: Enable long paths on Windows - shell: powershell - run: | - Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 - # Since it's just a defensive command, the workflow should continue even the command fails - - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. - shell: powershell - run: | - Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
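Each job then materializes two checkouts side by side: pytorch itself (at the pull-request head SHA for PR runs, otherwise the pushed SHA) and pytorch/builder at main, which supplies the packaging scripts. Because these Windows runners are not ephemeral, both trees are scrubbed with git clean -fxd so nothing survives from a previous run. In outline; the single combined clean step here is a condensation of the two separate per-repo clean steps used above:

      - name: Checkout PyTorch
        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
        with:
          # PR runs test the PR head commit; push runs test the pushed commit.
          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
          submodules: recursive
          path: pytorch
      - name: Checkout pytorch/builder
        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
        with:
          ref: main
          submodules: recursive
          repository: pytorch/builder
          path: builder
      - name: Clean both checkouts
        run: |
          # -f force, -x also remove ignored files, -d recurse into directories
          git -C pytorch clean -fxd
          git -C builder clean -fxd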
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - ref: main - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: actions/upload-artifact@v3 - if: always() - with: - name: libtorch-cuda11_3-shared-without-deps-debug - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda11_3-shared-without-deps-debug-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_3-shared-without-deps-debug-build - runs-on: windows.8xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_CONFIG: debug - LIBTORCH_VARIANT: shared-without-deps - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - echo "system info $(uname -a)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 - - name: Enable long paths on Windows - shell: powershell - run: | - Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name 
"LongPathsEnabled" -Value 1 - # Since it's just a defensive command, the workflow should continue even the command fails - - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. - shell: powershell - run: | - Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: actions/download-artifact@v3 - name: Download Build Artifacts - with: - name: libtorch-cuda11_3-shared-without-deps-debug - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - ref: main - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda11_3-shared-without-deps-debug-upload: # Uploading - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_3-shared-without-deps-debug-test - with: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - LIBTORCH_CONFIG: debug - LIBTORCH_VARIANT: shared-without-deps - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - build_name: libtorch-cuda11_3-shared-without-deps-debug - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} - conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - uses: ./.github/workflows/_binary-upload.yml - libtorch-cuda11_3-static-with-deps-debug-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: 
windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_CONFIG: debug - LIBTORCH_VARIANT: static-with-deps - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - echo "system info $(uname -a)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 - - name: Enable long paths on Windows - shell: powershell - run: | - Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 - # Since it's just a defensive command, the workflow should continue even the command fails - - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. - shell: powershell - run: | - Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
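The build and test halves of each configuration are stitched together through artifacts: every *-build job uploads its output under a name matching the configuration (for example libtorch-cuda11_3-static-with-deps-debug), and the corresponding *-test job, which declares needs: on the build job, downloads it by the same name into PYTORCH_FINAL_PACKAGE_DIR before running the test script. The handoff reduced to its essentials, with placeholder job and artifact names:

  example-build:                    # placeholder names throughout
    runs-on: windows.4xlarge
    steps:
      # ... build steps put binaries in PYTORCH_FINAL_PACKAGE_DIR ...
      - uses: actions/upload-artifact@v3
        if: always()                # upload even after failure, for debugging
        with:
          name: example-binary      # the test job downloads by this exact name
          retention-days: 14
          if-no-files-found: error
          path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
  example-test:
    needs: example-build            # guarantees the artifact exists first
    runs-on: windows.8xlarge.nvidia.gpu
    steps:
      - uses: actions/download-artifact@v3
        with:
          name: example-binary
          path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
      # ... test steps consume the downloaded binaries ...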
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - ref: main - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: actions/upload-artifact@v3 - if: always() - with: - name: libtorch-cuda11_3-static-with-deps-debug - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda11_3-static-with-deps-debug-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_3-static-with-deps-debug-build - runs-on: windows.8xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_CONFIG: debug - LIBTORCH_VARIANT: static-with-deps - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - echo "system info $(uname -a)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 - - name: Enable long paths on Windows - shell: powershell - run: | - Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" 
-Value 1 - # Since it's just a defensive command, the workflow should continue even the command fails - - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. - shell: powershell - run: | - Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: actions/download-artifact@v3 - name: Download Build Artifacts - with: - name: libtorch-cuda11_3-static-with-deps-debug - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - ref: main - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda11_3-static-with-deps-debug-upload: # Uploading - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_3-static-with-deps-debug-test - with: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - LIBTORCH_CONFIG: debug - LIBTORCH_VARIANT: static-with-deps - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - build_name: libtorch-cuda11_3-static-with-deps-debug - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} - conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - uses: ./.github/workflows/_binary-upload.yml - libtorch-cuda11_3-static-without-deps-debug-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: windows.4xlarge - 
timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_CONFIG: debug - LIBTORCH_VARIANT: static-without-deps - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - echo "system info $(uname -a)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 - - name: Enable long paths on Windows - shell: powershell - run: | - Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 - # Since it's just a defensive command, the workflow should continue even the command fails - - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. - shell: powershell - run: | - Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
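The *-upload jobs, by contrast, are thin wrappers: they define no steps of their own and instead invoke the reusable workflow ./.github/workflows/_binary-upload.yml, forwarding the package configuration through with: and the AWS and conda credentials through secrets:. Stripped to the pattern, with placeholder job and build names:

  example-upload:
    if: ${{ github.repository_owner == 'pytorch' }}
    needs: example-test             # upload only after tests pass
    uses: ./.github/workflows/_binary-upload.yml
    with:
      PACKAGE_TYPE: libtorch
      build_name: example-binary    # placeholder; matches the artifact name
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
      aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
      aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
      conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}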
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - ref: main - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: actions/upload-artifact@v3 - if: always() - with: - name: libtorch-cuda11_3-static-without-deps-debug - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda11_3-static-without-deps-debug-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_3-static-without-deps-debug-build - runs-on: windows.8xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_CONFIG: debug - LIBTORCH_VARIANT: static-without-deps - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - echo "system info $(uname -a)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 - - name: Enable long paths on Windows - shell: powershell - run: | - Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name 
"LongPathsEnabled" -Value 1 - # Since it's just a defensive command, the workflow should continue even the command fails - - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. - shell: powershell - run: | - Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: actions/download-artifact@v3 - name: Download Build Artifacts - with: - name: libtorch-cuda11_3-static-without-deps-debug - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - ref: main - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda11_3-static-without-deps-debug-upload: # Uploading - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_3-static-without-deps-debug-test - with: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - LIBTORCH_CONFIG: debug - LIBTORCH_VARIANT: static-without-deps - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - build_name: libtorch-cuda11_3-static-without-deps-debug - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} - conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - uses: ./.github/workflows/_binary-upload.yml libtorch-cuda11_6-shared-with-deps-debug-build: if: ${{ github.repository_owner == 'pytorch' }} runs-on: 
windows.4xlarge diff --git a/.github/workflows/generated-windows-binary-libtorch-release-nightly.yml b/.github/workflows/generated-windows-binary-libtorch-release-nightly.yml index 88458e0b5df8..f2f1d3badfe3 100644 --- a/.github/workflows/generated-windows-binary-libtorch-release-nightly.yml +++ b/.github/workflows/generated-windows-binary-libtorch-release-nightly.yml @@ -976,962 +976,6 @@ jobs: aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - libtorch-cuda11_3-shared-with-deps-release-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_CONFIG: release - LIBTORCH_VARIANT: shared-with-deps - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - echo "system info $(uname -a)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 - - name: Enable long paths on Windows - shell: powershell - run: | - Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 - # Since it's just a defensive command, the workflow should continue even the command fails - - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. - shell: powershell - run: | - Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
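Two defensive PowerShell steps run before the checkouts on every Windows job: one sets the LongPathsEnabled registry value so deep paths in the source tree do not hit the legacy 260-character Win32 limit (the issue linked in the comments, pytorch/pytorch#73339), and one excludes the workspace from Windows Defender scanning, where -ErrorAction Ignore keeps a failure of this best-effort step from failing the job. In isolation:

      - name: Enable long paths on Windows
        shell: powershell
        run: |
          Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1
      - name: Disable Windows Defender scanning of the workspace
        shell: powershell
        run: |
          # Best-effort: ignore errors so the workflow continues either way.
          Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore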
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - ref: main - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: actions/upload-artifact@v3 - if: always() - with: - name: libtorch-cuda11_3-shared-with-deps-release - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda11_3-shared-with-deps-release-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_3-shared-with-deps-release-build - runs-on: windows.8xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_CONFIG: release - LIBTORCH_VARIANT: shared-with-deps - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - echo "system info $(uname -a)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 - - name: Enable long paths on Windows - shell: powershell - run: | - Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name 
"LongPathsEnabled" -Value 1 - # Since it's just a defensive command, the workflow should continue even the command fails - - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. - shell: powershell - run: | - Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: actions/download-artifact@v3 - name: Download Build Artifacts - with: - name: libtorch-cuda11_3-shared-with-deps-release - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - ref: main - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda11_3-shared-with-deps-release-upload: # Uploading - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_3-shared-with-deps-release-test - with: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - LIBTORCH_CONFIG: release - LIBTORCH_VARIANT: shared-with-deps - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - build_name: libtorch-cuda11_3-shared-with-deps-release - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} - conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - uses: ./.github/workflows/_binary-upload.yml - libtorch-cuda11_3-shared-without-deps-release-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: 
windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_CONFIG: release - LIBTORCH_VARIANT: shared-without-deps - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - echo "system info $(uname -a)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 - - name: Enable long paths on Windows - shell: powershell - run: | - Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 - # Since it's just a defensive command, the workflow should continue even the command fails - - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. - shell: powershell - run: | - Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
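Every build and test job closes with the same pair of cleanup steps, both guarded by if: always() so they run even when an earlier step failed or the run was cancelled: the first waits up to two hours for any interactive SSH sessions opened through the add-github-ssh-key step to drain, and the second kills whatever sessions remain. The closing pattern:

      - name: Wait until all sessions have drained
        shell: powershell
        working-directory: pytorch
        if: always()                # run on failure and cancellation too
        timeout-minutes: 120
        run: |
          .github\scripts\wait_for_ssh_to_drain.ps1
      - name: Kill active ssh sessions if still around
        shell: powershell
        working-directory: pytorch
        if: always()
        run: |
          .github\scripts\kill_active_ssh_sessions.ps1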
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - ref: main - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: actions/upload-artifact@v3 - if: always() - with: - name: libtorch-cuda11_3-shared-without-deps-release - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda11_3-shared-without-deps-release-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_3-shared-without-deps-release-build - runs-on: windows.8xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_CONFIG: release - LIBTORCH_VARIANT: shared-without-deps - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - echo "system info $(uname -a)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 - - name: Enable long paths on Windows - shell: powershell - run: | - Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" 
-Name "LongPathsEnabled" -Value 1 - # Since it's just a defensive command, the workflow should continue even the command fails - - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. - shell: powershell - run: | - Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: actions/download-artifact@v3 - name: Download Build Artifacts - with: - name: libtorch-cuda11_3-shared-without-deps-release - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - ref: main - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda11_3-shared-without-deps-release-upload: # Uploading - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_3-shared-without-deps-release-test - with: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - LIBTORCH_CONFIG: release - LIBTORCH_VARIANT: shared-without-deps - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - build_name: libtorch-cuda11_3-shared-without-deps-release - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} - conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - uses: ./.github/workflows/_binary-upload.yml - libtorch-cuda11_3-static-with-deps-release-build: - if: ${{ github.repository_owner == 
'pytorch' }} - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_CONFIG: release - LIBTORCH_VARIANT: static-with-deps - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - echo "system info $(uname -a)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 - - name: Enable long paths on Windows - shell: powershell - run: | - Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 - # Since it's just a defensive command, the workflow should continue even the command fails - - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. - shell: powershell - run: | - Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - ref: main - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: actions/upload-artifact@v3 - if: always() - with: - name: libtorch-cuda11_3-static-with-deps-release - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda11_3-static-with-deps-release-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_3-static-with-deps-release-build - runs-on: windows.8xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_CONFIG: release - LIBTORCH_VARIANT: static-with-deps - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - echo "system info $(uname -a)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 - - name: Enable long paths on Windows - shell: powershell - run: | - Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name 
"LongPathsEnabled" -Value 1 - # Since it's just a defensive command, the workflow should continue even the command fails - - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. - shell: powershell - run: | - Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: actions/download-artifact@v3 - name: Download Build Artifacts - with: - name: libtorch-cuda11_3-static-with-deps-release - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - ref: main - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda11_3-static-with-deps-release-upload: # Uploading - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_3-static-with-deps-release-test - with: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - LIBTORCH_CONFIG: release - LIBTORCH_VARIANT: static-with-deps - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - build_name: libtorch-cuda11_3-static-with-deps-release - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} - conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - uses: ./.github/workflows/_binary-upload.yml - libtorch-cuda11_3-static-without-deps-release-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: 
windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_CONFIG: release - LIBTORCH_VARIANT: static-without-deps - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - echo "system info $(uname -a)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 - - name: Enable long paths on Windows - shell: powershell - run: | - Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 - # Since it's just a defensive command, the workflow should continue even the command fails - - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. - shell: powershell - run: | - Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - ref: main - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: actions/upload-artifact@v3 - if: always() - with: - name: libtorch-cuda11_3-static-without-deps-release - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda11_3-static-without-deps-release-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_3-static-without-deps-release-build - runs-on: windows.8xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_CONFIG: release - LIBTORCH_VARIANT: static-without-deps - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - echo "system info $(uname -a)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 - - name: Enable long paths on Windows - shell: powershell - run: | - Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" 
-Name "LongPathsEnabled" -Value 1 - # Since it's just a defensive command, the workflow should continue even the command fails - - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. - shell: powershell - run: | - Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: actions/download-artifact@v3 - name: Download Build Artifacts - with: - name: libtorch-cuda11_3-static-without-deps-release - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - ref: main - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda11_3-static-without-deps-release-upload: # Uploading - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_3-static-without-deps-release-test - with: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - LIBTORCH_CONFIG: release - LIBTORCH_VARIANT: static-without-deps - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - build_name: libtorch-cuda11_3-static-without-deps-release - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} - conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - uses: ./.github/workflows/_binary-upload.yml libtorch-cuda11_6-shared-with-deps-release-build: if: ${{ github.repository_owner == 'pytorch' 
}} runs-on: windows.4xlarge diff --git a/.github/workflows/generated-windows-binary-wheel-master.yml b/.github/workflows/generated-windows-binary-wheel-master.yml deleted file mode 100644 index 1cff1102c50a..000000000000 --- a/.github/workflows/generated-windows-binary-wheel-master.yml +++ /dev/null @@ -1,236 +0,0 @@ -# @generated DO NOT EDIT MANUALLY - -# Template is at: .github/templates/windows_binary_build_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: windows-binary-wheel - -on: - push: - branches: - - master - tags: - - 'ciflow/trunk/*' - workflow_dispatch: - -env: - # Needed for conda builds - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - ANACONDA_USER: pytorch - AWS_DEFAULT_REGION: us-east-1 - BUILD_ENVIRONMENT: windows-binary-wheel - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - SKIP_ALL_TESTS: 1 -concurrency: - group: windows-binary-wheel-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true - -jobs: - wheel-py3_7-cuda11_3-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - echo "system info $(uname -a)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 - - name: Enable long paths on Windows - shell: powershell - run: | - Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 - # Since it's just a defensive command, the workflow should continue even the command fails - - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. - shell: powershell - run: | - Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - ref: main - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: actions/upload-artifact@v3 - if: always() - with: - name: wheel-py3_7-cuda11_3 - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_7-cuda11_3-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_7-cuda11_3-build - runs-on: windows.8xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - echo "system info $(uname -a)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 - - name: Enable long paths on Windows - shell: powershell - run: | - Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 - # Since it's just a defensive command, the workflow should continue even the command fails - - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. 
- shell: powershell - run: | - Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: actions/download-artifact@v3 - name: Download Build Artifacts - with: - name: wheel-py3_7-cuda11_3 - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - ref: main - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 diff --git a/.github/workflows/generated-windows-binary-wheel-nightly.yml b/.github/workflows/generated-windows-binary-wheel-nightly.yml index 7dc8eb308381..026c81e6bb58 100644 --- a/.github/workflows/generated-windows-binary-wheel-nightly.yml +++ b/.github/workflows/generated-windows-binary-wheel-nightly.yml @@ -256,7 +256,7 @@ jobs: aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_7-cuda11_3-build: + wheel-py3_7-cuda11_6-build: if: ${{ github.repository_owner == 'pytorch' }} runs-on: windows.4xlarge timeout-minutes: 240 @@ -266,8 +266,8 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.7" @@ -343,7 +343,7 @@ jobs: - uses: actions/upload-artifact@v3 if: always() with: - name: wheel-py3_7-cuda11_3 + name: wheel-py3_7-cuda11_6 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -360,9 +360,9 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_7-cuda11_3-test: # Testing + wheel-py3_7-cuda11_6-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} - needs: 
wheel-py3_7-cuda11_3-build + needs: wheel-py3_7-cuda11_6-build runs-on: windows.8xlarge.nvidia.gpu timeout-minutes: 240 env: @@ -371,8 +371,8 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.7" @@ -417,7 +417,7 @@ jobs: - uses: actions/download-artifact@v3 name: Download Build Artifacts with: - name: wheel-py3_7-cuda11_3 + name: wheel-py3_7-cuda11_6 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 @@ -463,27 +463,27 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_7-cuda11_3-upload: # Uploading + wheel-py3_7-cuda11_6-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_7-cuda11_3-test + needs: wheel-py3_7-cuda11_6-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch BUILDER_ROOT: ${{ github.workspace }}/builder PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.7" - build_name: wheel-py3_7-cuda11_3 + build_name: wheel-py3_7-cuda11_6 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_7-cuda11_6-build: + wheel-py3_7-cuda11_7-build: if: ${{ github.repository_owner == 'pytorch' }} runs-on: windows.4xlarge timeout-minutes: 240 @@ -493,8 +493,8 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu116 - GPU_ARCH_VERSION: 11.6 + DESIRED_CUDA: cu117 + GPU_ARCH_VERSION: 11.7 GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.7" @@ -570,7 +570,7 @@ jobs: - uses: actions/upload-artifact@v3 if: always() with: - name: wheel-py3_7-cuda11_6 + name: wheel-py3_7-cuda11_7 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -587,9 +587,9 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_7-cuda11_6-test: # Testing + wheel-py3_7-cuda11_7-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_7-cuda11_6-build + needs: wheel-py3_7-cuda11_7-build runs-on: windows.8xlarge.nvidia.gpu timeout-minutes: 240 env: @@ -598,8 +598,8 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu116 - GPU_ARCH_VERSION: 11.6 + DESIRED_CUDA: cu117 + GPU_ARCH_VERSION: 11.7 GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.7" @@ -644,7 +644,7 @@ jobs: - uses: actions/download-artifact@v3 name: Download Build Artifacts with: - name: wheel-py3_7-cuda11_6 + name: wheel-py3_7-cuda11_7 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 @@ -690,27 +690,27 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_7-cuda11_6-upload: # Uploading + 
wheel-py3_7-cuda11_7-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_7-cuda11_6-test + needs: wheel-py3_7-cuda11_7-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch BUILDER_ROOT: ${{ github.workspace }}/builder PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu116 - GPU_ARCH_VERSION: 11.6 + DESIRED_CUDA: cu117 + GPU_ARCH_VERSION: 11.7 GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.7" - build_name: wheel-py3_7-cuda11_6 + build_name: wheel-py3_7-cuda11_7 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_7-cuda11_7-build: + wheel-py3_8-cpu-build: if: ${{ github.repository_owner == 'pytorch' }} runs-on: windows.4xlarge timeout-minutes: 240 @@ -720,11 +720,10 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu117 - GPU_ARCH_VERSION: 11.7 - GPU_ARCH_TYPE: cuda + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.7" + DESIRED_PYTHON: "3.8" steps: - name: Display EC2 information shell: bash @@ -797,7 +796,7 @@ jobs: - uses: actions/upload-artifact@v3 if: always() with: - name: wheel-py3_7-cuda11_7 + name: wheel-py3_8-cpu retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -814,10 +813,10 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_7-cuda11_7-test: # Testing + wheel-py3_8-cpu-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_7-cuda11_7-build - runs-on: windows.8xlarge.nvidia.gpu + needs: wheel-py3_8-cpu-build + runs-on: windows.4xlarge timeout-minutes: 240 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch @@ -825,11 +824,10 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu117 - GPU_ARCH_VERSION: 11.7 - GPU_ARCH_TYPE: cuda + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.7" + DESIRED_PYTHON: "3.8" steps: - name: Display EC2 information shell: bash @@ -871,7 +869,7 @@ jobs: - uses: actions/download-artifact@v3 name: Download Build Artifacts with: - name: wheel-py3_7-cuda11_7 + name: wheel-py3_8-cpu path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 @@ -917,27 +915,26 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_7-cuda11_7-upload: # Uploading + wheel-py3_8-cpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_7-cuda11_7-test + needs: wheel-py3_8-cpu-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch BUILDER_ROOT: ${{ github.workspace }}/builder PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu117 - GPU_ARCH_VERSION: 11.7 - GPU_ARCH_TYPE: cuda - DESIRED_PYTHON: "3.7" - build_name: wheel-py3_7-cuda11_7 + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DESIRED_PYTHON: "3.8" + build_name: wheel-py3_8-cpu secrets: github-token: ${{ secrets.GITHUB_TOKEN }} aws-access-key-id: ${{ 
secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_8-cpu-build: + wheel-py3_8-cuda11_6-build: if: ${{ github.repository_owner == 'pytorch' }} runs-on: windows.4xlarge timeout-minutes: 240 @@ -947,8 +944,9 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.8" steps: @@ -1023,7 +1021,7 @@ jobs: - uses: actions/upload-artifact@v3 if: always() with: - name: wheel-py3_8-cpu + name: wheel-py3_8-cuda11_6 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -1040,10 +1038,10 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_8-cpu-test: # Testing + wheel-py3_8-cuda11_6-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_8-cpu-build - runs-on: windows.4xlarge + needs: wheel-py3_8-cuda11_6-build + runs-on: windows.8xlarge.nvidia.gpu timeout-minutes: 240 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch @@ -1051,8 +1049,9 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.8" steps: @@ -1096,7 +1095,7 @@ jobs: - uses: actions/download-artifact@v3 name: Download Build Artifacts with: - name: wheel-py3_8-cpu + name: wheel-py3_8-cuda11_6 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 @@ -1142,26 +1141,27 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_8-cpu-upload: # Uploading + wheel-py3_8-cuda11_6-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_8-cpu-test + needs: wheel-py3_8-cuda11_6-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch BUILDER_ROOT: ${{ github.workspace }}/builder PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.8" - build_name: wheel-py3_8-cpu + build_name: wheel-py3_8-cuda11_6 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_8-cuda11_3-build: + wheel-py3_8-cuda11_7-build: if: ${{ github.repository_owner == 'pytorch' }} runs-on: windows.4xlarge timeout-minutes: 240 @@ -1171,8 +1171,8 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 + DESIRED_CUDA: cu117 + GPU_ARCH_VERSION: 11.7 GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.8" @@ -1248,7 +1248,7 @@ jobs: - uses: actions/upload-artifact@v3 if: always() with: - name: wheel-py3_8-cuda11_3 + name: 
wheel-py3_8-cuda11_7 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -1265,9 +1265,9 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_8-cuda11_3-test: # Testing + wheel-py3_8-cuda11_7-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_8-cuda11_3-build + needs: wheel-py3_8-cuda11_7-build runs-on: windows.8xlarge.nvidia.gpu timeout-minutes: 240 env: @@ -1276,8 +1276,8 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 + DESIRED_CUDA: cu117 + GPU_ARCH_VERSION: 11.7 GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.8" @@ -1322,7 +1322,7 @@ jobs: - uses: actions/download-artifact@v3 name: Download Build Artifacts with: - name: wheel-py3_8-cuda11_3 + name: wheel-py3_8-cuda11_7 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 @@ -1368,27 +1368,27 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_8-cuda11_3-upload: # Uploading + wheel-py3_8-cuda11_7-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_8-cuda11_3-test + needs: wheel-py3_8-cuda11_7-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch BUILDER_ROOT: ${{ github.workspace }}/builder PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 + DESIRED_CUDA: cu117 + GPU_ARCH_VERSION: 11.7 GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.8" - build_name: wheel-py3_8-cuda11_3 + build_name: wheel-py3_8-cuda11_7 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_8-cuda11_6-build: + wheel-py3_9-cpu-build: if: ${{ github.repository_owner == 'pytorch' }} runs-on: windows.4xlarge timeout-minutes: 240 @@ -1398,11 +1398,10 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu116 - GPU_ARCH_VERSION: 11.6 - GPU_ARCH_TYPE: cuda + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.8" + DESIRED_PYTHON: "3.9" steps: - name: Display EC2 information shell: bash @@ -1475,7 +1474,7 @@ jobs: - uses: actions/upload-artifact@v3 if: always() with: - name: wheel-py3_8-cuda11_6 + name: wheel-py3_9-cpu retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -1492,10 +1491,10 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_8-cuda11_6-test: # Testing + wheel-py3_9-cpu-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_8-cuda11_6-build - runs-on: windows.8xlarge.nvidia.gpu + needs: wheel-py3_9-cpu-build + runs-on: windows.4xlarge timeout-minutes: 240 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch @@ -1503,11 +1502,10 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu116 - GPU_ARCH_VERSION: 11.6 - GPU_ARCH_TYPE: cuda + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu 
SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.8" + DESIRED_PYTHON: "3.9" steps: - name: Display EC2 information shell: bash @@ -1549,7 +1547,7 @@ jobs: - uses: actions/download-artifact@v3 name: Download Build Artifacts with: - name: wheel-py3_8-cuda11_6 + name: wheel-py3_9-cpu path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 @@ -1595,27 +1593,26 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_8-cuda11_6-upload: # Uploading + wheel-py3_9-cpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_8-cuda11_6-test + needs: wheel-py3_9-cpu-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch BUILDER_ROOT: ${{ github.workspace }}/builder PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu116 - GPU_ARCH_VERSION: 11.6 - GPU_ARCH_TYPE: cuda - DESIRED_PYTHON: "3.8" - build_name: wheel-py3_8-cuda11_6 + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DESIRED_PYTHON: "3.9" + build_name: wheel-py3_9-cpu secrets: github-token: ${{ secrets.GITHUB_TOKEN }} aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_8-cuda11_7-build: + wheel-py3_9-cuda11_6-build: if: ${{ github.repository_owner == 'pytorch' }} runs-on: windows.4xlarge timeout-minutes: 240 @@ -1625,11 +1622,11 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu117 - GPU_ARCH_VERSION: 11.7 + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.8" + DESIRED_PYTHON: "3.9" steps: - name: Display EC2 information shell: bash @@ -1702,7 +1699,7 @@ jobs: - uses: actions/upload-artifact@v3 if: always() with: - name: wheel-py3_8-cuda11_7 + name: wheel-py3_9-cuda11_6 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -1719,9 +1716,9 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_8-cuda11_7-test: # Testing + wheel-py3_9-cuda11_6-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_8-cuda11_7-build + needs: wheel-py3_9-cuda11_6-build runs-on: windows.8xlarge.nvidia.gpu timeout-minutes: 240 env: @@ -1730,11 +1727,11 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu117 - GPU_ARCH_VERSION: 11.7 + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.8" + DESIRED_PYTHON: "3.9" steps: - name: Display EC2 information shell: bash @@ -1776,7 +1773,7 @@ jobs: - uses: actions/download-artifact@v3 name: Download Build Artifacts with: - name: wheel-py3_8-cuda11_7 + name: wheel-py3_9-cuda11_6 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 @@ -1822,27 +1819,27 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_8-cuda11_7-upload: # Uploading + wheel-py3_9-cuda11_6-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_8-cuda11_7-test + needs: 
wheel-py3_9-cuda11_6-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch BUILDER_ROOT: ${{ github.workspace }}/builder PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu117 - GPU_ARCH_VERSION: 11.7 + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 GPU_ARCH_TYPE: cuda - DESIRED_PYTHON: "3.8" - build_name: wheel-py3_8-cuda11_7 + DESIRED_PYTHON: "3.9" + build_name: wheel-py3_9-cuda11_6 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_9-cpu-build: + wheel-py3_9-cuda11_7-build: if: ${{ github.repository_owner == 'pytorch' }} runs-on: windows.4xlarge timeout-minutes: 240 @@ -1852,8 +1849,9 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu + DESIRED_CUDA: cu117 + GPU_ARCH_VERSION: 11.7 + GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.9" steps: @@ -1928,7 +1926,7 @@ jobs: - uses: actions/upload-artifact@v3 if: always() with: - name: wheel-py3_9-cpu + name: wheel-py3_9-cuda11_7 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -1945,10 +1943,10 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_9-cpu-test: # Testing + wheel-py3_9-cuda11_7-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_9-cpu-build - runs-on: windows.4xlarge + needs: wheel-py3_9-cuda11_7-build + runs-on: windows.8xlarge.nvidia.gpu timeout-minutes: 240 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch @@ -1956,8 +1954,9 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu + DESIRED_CUDA: cu117 + GPU_ARCH_VERSION: 11.7 + GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.9" steps: @@ -2001,7 +2000,7 @@ jobs: - uses: actions/download-artifact@v3 name: Download Build Artifacts with: - name: wheel-py3_9-cpu + name: wheel-py3_9-cuda11_7 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 @@ -2047,26 +2046,27 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_9-cpu-upload: # Uploading + wheel-py3_9-cuda11_7-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_9-cpu-test + needs: wheel-py3_9-cuda11_7-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch BUILDER_ROOT: ${{ github.workspace }}/builder PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu + DESIRED_CUDA: cu117 + GPU_ARCH_VERSION: 11.7 + GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.9" - build_name: wheel-py3_9-cpu + build_name: wheel-py3_9-cuda11_7 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - 
wheel-py3_9-cuda11_3-build: + wheel-py3_10-cpu-build: if: ${{ github.repository_owner == 'pytorch' }} runs-on: windows.4xlarge timeout-minutes: 240 @@ -2076,11 +2076,10 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" steps: - name: Display EC2 information shell: bash @@ -2153,7 +2152,7 @@ jobs: - uses: actions/upload-artifact@v3 if: always() with: - name: wheel-py3_9-cuda11_3 + name: wheel-py3_10-cpu retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -2170,10 +2169,10 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_9-cuda11_3-test: # Testing + wheel-py3_10-cpu-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_9-cuda11_3-build - runs-on: windows.8xlarge.nvidia.gpu + needs: wheel-py3_10-cpu-build + runs-on: windows.4xlarge timeout-minutes: 240 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch @@ -2181,11 +2180,10 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" steps: - name: Display EC2 information shell: bash @@ -2227,7 +2225,7 @@ jobs: - uses: actions/download-artifact@v3 name: Download Build Artifacts with: - name: wheel-py3_9-cuda11_3 + name: wheel-py3_10-cpu path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 @@ -2273,688 +2271,9 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_9-cuda11_3-upload: # Uploading + wheel-py3_10-cpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_9-cuda11_3-test - with: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DESIRED_PYTHON: "3.9" - build_name: wheel-py3_9-cuda11_3 - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} - conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - uses: ./.github/workflows/_binary-upload.yml - wheel-py3_9-cuda11_6-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu116 - GPU_ARCH_VERSION: 11.6 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - 
category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - echo "system info $(uname -a)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 - - name: Enable long paths on Windows - shell: powershell - run: | - Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 - # Since it's just a defensive command, the workflow should continue even the command fails - - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. - shell: powershell - run: | - Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - ref: main - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: actions/upload-artifact@v3 - if: always() - with: - name: wheel-py3_9-cuda11_6 - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_9-cuda11_6-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_9-cuda11_6-build - runs-on: windows.8xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu116 - GPU_ARCH_VERSION: 11.6 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - 
DESIRED_PYTHON: "3.9" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - echo "system info $(uname -a)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 - - name: Enable long paths on Windows - shell: powershell - run: | - Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 - # Since it's just a defensive command, the workflow should continue even the command fails - - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. - shell: powershell - run: | - Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: actions/download-artifact@v3 - name: Download Build Artifacts - with: - name: wheel-py3_9-cuda11_6 - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - ref: main - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_9-cuda11_6-upload: # Uploading - if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_9-cuda11_6-test - with: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # 
TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu116 - GPU_ARCH_VERSION: 11.6 - GPU_ARCH_TYPE: cuda - DESIRED_PYTHON: "3.9" - build_name: wheel-py3_9-cuda11_6 - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} - conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - uses: ./.github/workflows/_binary-upload.yml - wheel-py3_9-cuda11_7-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu117 - GPU_ARCH_VERSION: 11.7 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - echo "system info $(uname -a)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 - - name: Enable long paths on Windows - shell: powershell - run: | - Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 - # Since it's just a defensive command, the workflow should continue even the command fails - - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. - shell: powershell - run: | - Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
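The recurring "Populate binary env" step below relies on GitHub Actions' GITHUB_ENV mechanism: every KEY=VALUE line appended to the file named by "${GITHUB_ENV}" is exported as an environment variable to all later steps in the same job. A minimal Python sketch of that mechanism, assuming it runs inside an Actions job (the helper name populate_binary_env is invented for illustration; the real steps are inline bash):

```python
import os

def populate_binary_env(runner_temp: str) -> None:
    # GitHub Actions re-reads the $GITHUB_ENV file after each step and exports
    # every KEY=VALUE line to all subsequent steps in the job.
    with open(os.environ["GITHUB_ENV"], "a") as env_file:
        env_file.write(f"BINARY_ENV_FILE={runner_temp}/env\n")
        env_file.write(f"PYTORCH_FINAL_PACKAGE_DIR={runner_temp}/artifacts\n")

populate_binary_env(os.environ["RUNNER_TEMP"])
```

Note that, as reproduced in these jobs, the WIN_PACKAGE_WORK_DIR echo is not redirected into "${GITHUB_ENV}", so only the first two variables actually persist across steps.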
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - ref: main - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: actions/upload-artifact@v3 - if: always() - with: - name: wheel-py3_9-cuda11_7 - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_9-cuda11_7-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_9-cuda11_7-build - runs-on: windows.8xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu117 - GPU_ARCH_VERSION: 11.7 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - echo "system info $(uname -a)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 - - name: Enable long paths on Windows - shell: powershell - run: | - Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 - # Since it's just a defensive command, the workflow should continue even the command fails - - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. 
- shell: powershell - run: | - Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: actions/download-artifact@v3 - name: Download Build Artifacts - with: - name: wheel-py3_9-cuda11_7 - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - ref: main - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_9-cuda11_7-upload: # Uploading - if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_9-cuda11_7-test - with: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu117 - GPU_ARCH_VERSION: 11.7 - GPU_ARCH_TYPE: cuda - DESIRED_PYTHON: "3.9" - build_name: wheel-py3_9-cuda11_7 - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} - conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - uses: ./.github/workflows/_binary-upload.yml - wheel-py3_10-cpu-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see 
https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - echo "system info $(uname -a)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 - - name: Enable long paths on Windows - shell: powershell - run: | - Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 - # Since it's just a defensive command, the workflow should continue even the command fails - - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. - shell: powershell - run: | - Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - ref: main - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: actions/upload-artifact@v3 - if: always() - with: - name: wheel-py3_10-cpu - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_10-cpu-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_10-cpu-build - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - 
GPU_ARCH_TYPE: cpu - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - echo "system info $(uname -a)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 - - name: Enable long paths on Windows - shell: powershell - run: | - Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 - # Since it's just a defensive command, the workflow should continue even the command fails - - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. - shell: powershell - run: | - Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: actions/download-artifact@v3 - name: Download Build Artifacts - with: - name: wheel-py3_10-cpu - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - ref: main - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_10-cpu-upload: # Uploading - if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_10-cpu-test + needs: wheel-py3_10-cpu-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch BUILDER_ROOT: ${{ 
github.workspace }}/builder @@ -2971,233 +2290,6 @@ jobs: aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_10-cuda11_3-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - echo "system info $(uname -a)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 - - name: Enable long paths on Windows - shell: powershell - run: | - Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 - # Since it's just a defensive command, the workflow should continue even the command fails - - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. - shell: powershell - run: | - Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
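Each of these Windows binary jobs opens with the same get_ec2_metadata shell helper shown above, which simply curls categories from the EC2 instance-metadata endpoint at 169.254.169.254. A rough Python equivalent of that helper, runnable only from inside an EC2 instance (a sketch, not part of the workflow):

```python
from urllib.request import urlopen

def get_ec2_metadata(category: str) -> str:
    # Same instance-metadata endpoint the workflow step queries with curl -fsSL.
    url = f"http://169.254.169.254/latest/meta-data/{category}"
    with urlopen(url, timeout=5) as response:
        return response.read().decode().strip()

for category in ("ami-id", "instance-id", "instance-type"):
    print(f"{category}: {get_ec2_metadata(category)}")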
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - ref: main - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: actions/upload-artifact@v3 - if: always() - with: - name: wheel-py3_10-cuda11_3 - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_10-cuda11_3-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_10-cuda11_3-build - runs-on: windows.8xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - echo "system info $(uname -a)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 - - name: Enable long paths on Windows - shell: powershell - run: | - Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 - # Since it's just a defensive command, the workflow should continue even the command fails - - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. 
- shell: powershell - run: | - Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: actions/download-artifact@v3 - name: Download Build Artifacts - with: - name: wheel-py3_10-cuda11_3 - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - ref: main - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_10-cuda11_3-upload: # Uploading - if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_10-cuda11_3-test - with: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DESIRED_PYTHON: "3.10" - build_name: wheel-py3_10-cuda11_3 - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} - conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - uses: ./.github/workflows/_binary-upload.yml wheel-py3_10-cuda11_6-build: if: ${{ github.repository_owner == 'pytorch' }} runs-on: windows.4xlarge diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index eb39618619e7..aa386b783264 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -30,7 +30,11 @@ jobs: **/.github/requirements-gha-cache.txt - name: Install lintrunner - run: pip install lintrunner==0.9.2 + uses: nick-fields/retry@7d4a37704547a311dbb66ebdf5b23ec19374a767 + with: + timeout_minutes: 5 + max_attempts: 3 + command: pip install lintrunner==0.9.2 - name: Initialize lint dependencies run: lintrunner init diff --git 
a/.github/workflows/periodic.yml b/.github/workflows/periodic.yml index 2a2448104421..3305dc0d0987 100644 --- a/.github/workflows/periodic.yml +++ b/.github/workflows/periodic.yml @@ -162,6 +162,12 @@ jobs: with: build-environment: win-vs2019-cuda11.7-py3 cuda-version: "11.7" + test-matrix: | + { include: [ + { config: "default", shard: 1, num_shards: 2, runner: "windows.8xlarge.nvidia.gpu" }, + { config: "default", shard: 2, num_shards: 2, runner: "windows.8xlarge.nvidia.gpu" }, + { config: "force_on_cpu", shard: 1, num_shards: 1, runner: "windows.4xlarge" }, + ]} win-vs2019-cuda11_7-py3-test: name: win-vs2019-cuda11.7-py3 @@ -170,12 +176,7 @@ jobs: with: build-environment: win-vs2019-cuda11.7-py3 cuda-version: "11.7" - test-matrix: | - { include: [ - { config: "default", shard: 1, num_shards: 2, runner: "windows.8xlarge.nvidia.gpu" }, - { config: "default", shard: 2, num_shards: 2, runner: "windows.8xlarge.nvidia.gpu" }, - { config: "force_on_cpu", shard: 1, num_shards: 1, runner: "windows.4xlarge" }, - ]} + test-matrix: ${{ needs.win-vs2019-cuda11_7-py3-build.outputs.test-matrix }} ios-12-5-1-x86-64-coreml: name: ios-12-5-1-x86-64-coreml diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index 58a4706897be..6b9e184afc26 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -25,7 +25,8 @@ jobs: { include: [ { config: "default", shard: 1, num_shards: 2, runner: "linux.2xlarge" }, { config: "default", shard: 2, num_shards: 2, runner: "linux.2xlarge" }, - { config: "distributed", shard: 1, num_shards: 1, runner: "linux.2xlarge" }, + { config: "distributed", shard: 1, num_shards: 2, runner: "linux.2xlarge" }, + { config: "distributed", shard: 2, num_shards: 2, runner: "linux.2xlarge" }, { config: "functorch", shard: 1, num_shards: 1, runner: "linux.2xlarge" }, { config: "docs_test", shard: 1, num_shards: 1, runner: "linux.2xlarge" }, { config: "jit_legacy", shard: 1, num_shards: 1, runner: "linux.2xlarge" }, @@ -237,6 +238,12 @@ jobs: with: build-environment: win-vs2019-cpu-py3 cuda-version: cpu + test-matrix: | + { include: [ + { config: "default", shard: 1, num_shards: 2, runner: "windows.4xlarge" }, + { config: "default", shard: 2, num_shards: 2, runner: "windows.4xlarge" }, + { config: "functorch", shard: 1, num_shards: 1, runner: "windows.4xlarge" }, + ]} win-vs2019-cpu-py3-test: name: win-vs2019-cpu-py3 @@ -245,12 +252,7 @@ jobs: with: build-environment: win-vs2019-cpu-py3 cuda-version: cpu - test-matrix: | - { include: [ - { config: "default", shard: 1, num_shards: 2, runner: "windows.4xlarge" }, - { config: "default", shard: 2, num_shards: 2, runner: "windows.4xlarge" }, - { config: "functorch", shard: 1, num_shards: 1, runner: "windows.4xlarge" }, - ]} + test-matrix: ${{ needs.win-vs2019-cpu-py3-build.outputs.test-matrix }} win-vs2019-cuda11_6-py3-build: if: github.event_name == 'pull_request' @@ -260,6 +262,16 @@ jobs: build-environment: win-vs2019-cuda11.6-py3 cuda-version: "11.6" sync-tag: win-cuda-build + test-matrix: | + { include: [ + { config: "default", shard: 1, num_shards: 5, runner: "windows.8xlarge.nvidia.gpu" }, + { config: "default", shard: 2, num_shards: 5, runner: "windows.8xlarge.nvidia.gpu" }, + { config: "default", shard: 3, num_shards: 5, runner: "windows.8xlarge.nvidia.gpu" }, + { config: "default", shard: 4, num_shards: 5, runner: "windows.8xlarge.nvidia.gpu" }, + { config: "default", shard: 5, num_shards: 5, runner: "windows.8xlarge.nvidia.gpu" }, + { config: "functorch", shard: 1, num_shards: 1, runner: 
"windows.8xlarge.nvidia.gpu" }, + { config: "force_on_cpu", shard: 1, num_shards: 1, runner: "windows.4xlarge" }, + ]} linux-bionic-cuda11_6-py3_10-gcc7-bazel-test: name: linux-bionic-cuda11.6-py3.10-gcc7-bazel-test diff --git a/.github/workflows/push_nightly_docker_ghcr.yml b/.github/workflows/push_nightly_docker_ghcr.yml index bdcc6e05dc59..3eb204db3fa3 100644 --- a/.github/workflows/push_nightly_docker_ghcr.yml +++ b/.github/workflows/push_nightly_docker_ghcr.yml @@ -28,7 +28,7 @@ jobs: - uses: nick-fields/retry@7d4a37704547a311dbb66ebdf5b23ec19374a767 name: Build and upload nightly docker with: - timeout_minutes: 10 + timeout_minutes: 30 max_attempts: 3 command: | set -ex diff --git a/.github/workflows/run_torchbench.yml b/.github/workflows/run_torchbench.yml index f9c3039fc4f8..9a46a23af5bf 100644 --- a/.github/workflows/run_torchbench.yml +++ b/.github/workflows/run_torchbench.yml @@ -10,6 +10,8 @@ env: PR_BODY: ${{ github.event.pull_request.body }} PR_BASE_SHA: ${{ github.event.pull_request.base.sha }} PR_HEAD_SHA: ${{ github.event.pull_request.head.sha }} + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_OSSCI_METRICS_V2_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_OSSCI_METRICS_V2_SECRET_ACCESS_KEY }} jobs: run-torchbench: @@ -39,7 +41,7 @@ jobs: # pin cmake version to 3.22 since 3.23 breaks pytorch build # see details at: https://github.com/pytorch/pytorch/issues/74985 conda install -y numpy="${NUMPY_VERSION}" requests ninja pyyaml mkl mkl-include \ - setuptools cmake=3.22 cffi typing_extensions \ + setuptools cmake=3.22 cffi typing_extensions boto3 \ future six dataclasses pillow pytest tabulate gitpython git-lfs tqdm psutil - name: Setup TorchBench branch run: | @@ -78,6 +80,13 @@ jobs: --pr-num "$PR_NUM" \ --pr-base-sha "$PR_MERGE_BASE" \ --pr-head-sha "$PR_HEAD_SHA" + - name: Upload result to S3 + run: | + . 
"${HOME}"/anaconda3/etc/profile.d/conda.sh + conda activate pr-ci + python3 pytorch/.github/scripts/run_torchbench.py \ + upload-s3 \ + --result-dir "${HOME}/.torchbench/bisection/pr${{ github.event.number }}" - name: Remove conda environment and cleanup run: | conda env remove --name pr-ci diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml index 4e29526d438a..e03f7f5b9051 100644 --- a/.github/workflows/trunk.yml +++ b/.github/workflows/trunk.yml @@ -141,6 +141,12 @@ jobs: xcode-version: "13.3.1" runner-type: macos-12-xl build-generates-artifacts: true + test-matrix: | + { include: [ + { config: "default", shard: 1, num_shards: 2, runner: "macos-12" }, + { config: "default", shard: 2, num_shards: 2, runner: "macos-12" }, + { config: "functorch", shard: 1, num_shards: 1, runner: "macos-12" }, + ]} secrets: MACOS_SCCACHE_S3_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} MACOS_SCCACHE_S3_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} @@ -151,12 +157,7 @@ jobs: needs: macos-12-py3-x86-64-build with: build-environment: macos-12-py3-x86-64 - test-matrix: | - { include: [ - { config: "default", shard: 1, num_shards: 2, runner: "macos-12" }, - { config: "default", shard: 2, num_shards: 2, runner: "macos-12" }, - { config: "functorch", shard: 1, num_shards: 1, runner: "macos-12" }, - ]} + test-matrix: ${{ needs.macos-12-py3-x86-64-build.outputs.test-matrix }} arch: x86_64 secrets: AWS_OSSCI_METRICS_V2_ACCESS_KEY_ID: ${{ secrets.AWS_OSSCI_METRICS_V2_ACCESS_KEY_ID }} @@ -185,6 +186,11 @@ jobs: build-generates-artifacts: true # To match the one pre-installed in the m1 runners python_version: 3.9.12 + test-matrix: | + { include: [ + { config: "default", shard: 1, num_shards: 2, runner: "macos-m1-12" }, + { config: "default", shard: 2, num_shards: 2, runner: "macos-m1-12" }, + ]} secrets: MACOS_SCCACHE_S3_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} MACOS_SCCACHE_S3_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} @@ -193,6 +199,7 @@ jobs: name: macos-12-py3-arm64-mps uses: ./.github/workflows/_mac-test-mps.yml needs: macos-12-py3-arm64-build + if: needs.macos-12-py3-arm64-build.outputs.build-outcome == 'success' with: sync-tag: macos-12-py3-arm64-mps-test build-environment: macos-12-py3-arm64 @@ -203,11 +210,7 @@ jobs: needs: macos-12-py3-arm64-build with: build-environment: macos-12-py3-arm64 - test-matrix: | - { include: [ - { config: "default", shard: 1, num_shards: 2, runner: "macos-m1-12" }, - { config: "default", shard: 2, num_shards: 2, runner: "macos-m1-12" }, - ]} + test-matrix: ${{ needs.macos-12-py3-arm64-build.outputs.test-matrix }} arch: arm64 secrets: AWS_OSSCI_METRICS_V2_ACCESS_KEY_ID: ${{ secrets.AWS_OSSCI_METRICS_V2_ACCESS_KEY_ID }} @@ -220,14 +223,6 @@ jobs: build-environment: win-vs2019-cuda11.6-py3 cuda-version: "11.6" sync-tag: win-cuda-build - - win-vs2019-cuda11_6-py3-test: - name: win-vs2019-cuda11.6-py3 - uses: ./.github/workflows/_win-test.yml - needs: win-vs2019-cuda11_6-py3-build - with: - build-environment: win-vs2019-cuda11.6-py3 - cuda-version: "11.6" test-matrix: | { include: [ { config: "default", shard: 1, num_shards: 5, runner: "windows.8xlarge.nvidia.gpu" }, @@ -239,6 +234,15 @@ jobs: { config: "force_on_cpu", shard: 1, num_shards: 1, runner: "windows.4xlarge" }, ]} + win-vs2019-cuda11_6-py3-test: + name: win-vs2019-cuda11.6-py3 + uses: ./.github/workflows/_win-test.yml + needs: win-vs2019-cuda11_6-py3-build + with: + build-environment: win-vs2019-cuda11.6-py3 + cuda-version: 
"11.6" + test-matrix: ${{ needs.win-vs2019-cuda11_6-py3-build.outputs.test-matrix }} + linux-focal-rocm5_2-py3_7-build: name: linux-focal-rocm5.2-py3.7 uses: ./.github/workflows/_linux-build.yml diff --git a/.github/workflows/trymerge.yml b/.github/workflows/trymerge.yml index 9ba29af66002..7b2c4336e73c 100644 --- a/.github/workflows/trymerge.yml +++ b/.github/workflows/trymerge.yml @@ -38,8 +38,13 @@ jobs: ON_GREEN: ${{ github.event.client_payload.on_green}} LAND_CHECKS: ${{ github.event.client_payload.land_checks }} COMMENT_ID: ${{ github.event.client_payload.comment_id }} + REBASE: ${{ github.event.client_payload.rebase }} run: | set -ex + if [ -n "${REBASE}" ]; then + python3 .github/scripts/tryrebase.py "${PR_NUM}" --branch "${REBASE}" + git checkout master + fi if [ -n "${FORCE}" ]; then if [ -n "${COMMENT_ID}" ]; then python3 .github/scripts/trymerge.py --force --comment-id "${COMMENT_ID}" "${PR_NUM}" diff --git a/.gitmodules b/.gitmodules index 32c0c205948a..282746ed0b53 100644 --- a/.gitmodules +++ b/.gitmodules @@ -151,3 +151,6 @@ [submodule "third_party/VulkanMemoryAllocator"] path = third_party/VulkanMemoryAllocator url = https://github.com/GPUOpen-LibrariesAndSDKs/VulkanMemoryAllocator.git +[submodule "third_party/cutlass"] + path = third_party/cutlass + url = https://github.com/NVIDIA/cutlass.git diff --git a/.jenkins/pytorch/build.sh b/.jenkins/pytorch/build.sh index a215459fcc7e..e808d83cbcf9 100755 --- a/.jenkins/pytorch/build.sh +++ b/.jenkins/pytorch/build.sh @@ -68,8 +68,13 @@ fi pip_install -r requirements.txt || true # Enable LLVM dependency for TensorExpr testing -export USE_LLVM=/opt/llvm -export LLVM_DIR=/opt/llvm/lib/cmake/llvm +if [[ "$BUILD_ENVIRONMENT" == *rocm* ]]; then + export USE_LLVM=/opt/rocm/llvm + export LLVM_DIR=/opt/rocm/llvm/lib/cmake/llvm +else + export USE_LLVM=/opt/llvm + export LLVM_DIR=/opt/llvm/lib/cmake/llvm +fi # TODO: Don't install this here if ! which conda; then @@ -146,9 +151,9 @@ if [[ "$BUILD_ENVIRONMENT" == *rocm* ]]; then fi if [[ -n "$CI" && -z "$PYTORCH_ROCM_ARCH" ]]; then - # Set ROCM_ARCH to gfx900 and gfx906 for CI builds, if user doesn't override. - echo "Limiting PYTORCH_ROCM_ARCH to gfx90[06] for CI builds" - export PYTORCH_ROCM_ARCH="gfx900;gfx906" + # Set ROCM_ARCH to gfx906 for CI builds, if user doesn't override. 
+ echo "Limiting PYTORCH_ROCM_ARCH to gfx906 for CI builds" + export PYTORCH_ROCM_ARCH="gfx906" fi # hipify sources diff --git a/.jenkins/pytorch/common_utils.sh b/.jenkins/pytorch/common_utils.sh index 7b592d57c280..61a7cb36178b 100644 --- a/.jenkins/pytorch/common_utils.sh +++ b/.jenkins/pytorch/common_utils.sh @@ -141,12 +141,6 @@ function checkout_install_torchdynamo() { popd } -function install_functorch() { - pushd functorch - time python setup.py develop - popd -} - function test_functorch() { python test/run_test.py --functorch --verbose } diff --git a/.jenkins/pytorch/macos-common.sh b/.jenkins/pytorch/macos-common.sh index 4df378d505ec..319e88e40aa8 100755 --- a/.jenkins/pytorch/macos-common.sh +++ b/.jenkins/pytorch/macos-common.sh @@ -28,7 +28,7 @@ else numpy=1.18.5 \ pyyaml=5.3 \ setuptools=46.0.0 \ - cmake=3.19 \ + cmake=3.22.1 \ cffi \ ninja \ typing_extensions \ diff --git a/.jenkins/pytorch/macos-test.sh b/.jenkins/pytorch/macos-test.sh index a30e16ba942e..244c9dda7fc1 100755 --- a/.jenkins/pytorch/macos-test.sh +++ b/.jenkins/pytorch/macos-test.sh @@ -177,7 +177,6 @@ test_dynamo() { } if [[ "${TEST_CONFIG}" == *functorch* ]]; then - install_functorch test_functorch elif [[ $NUM_TEST_SHARDS -gt 1 ]]; then test_python_shard "${SHARD_NUMBER}" diff --git a/.jenkins/pytorch/test.sh b/.jenkins/pytorch/test.sh index 43e6119d4401..231a47bcc9f5 100755 --- a/.jenkins/pytorch/test.sh +++ b/.jenkins/pytorch/test.sh @@ -180,9 +180,6 @@ test_dynamo_shard() { echo "NUM_TEST_SHARDS must be defined to run a Python test shard" exit 1 fi - pushd functorch - python setup.py develop - popd # Temporarily disable test_fx for dynamo pending the investigation on TTS # regression in https://github.com/pytorch/torchdynamo/issues/784 time python test/run_test.py \ @@ -197,8 +194,8 @@ test_dynamo_shard() { test_reductions \ test_namedtensor \ test_namedtuple_return_api \ - test_profiler \ - test_profiler_tree \ + profiler/test_profiler \ + profiler/test_profiler_tree \ test_overrides \ test_python_dispatch \ test_fx \ @@ -332,6 +329,14 @@ test_libtorch() { test_aot_compilation() { echo "Testing Ahead of Time compilation" + ln -sf "$TORCH_LIB_DIR"/libc10* "$TORCH_BIN_DIR" + ln -sf "$TORCH_LIB_DIR"/libtorch* "$TORCH_BIN_DIR" + + # Make test_reports directory + # NB: the ending test_libtorch must match the current function name for the current + # test reporting process (in print_test_stats.py) to function as expected. 
+ TEST_REPORTS_DIR=test/test-reports/cpp-unittest/test_aot_compilation + mkdir -p $TEST_REPORTS_DIR if [ -f "$TORCH_BIN_DIR"/test_mobile_nnc ]; then "$TORCH_BIN_DIR"/test_mobile_nnc --gtest_output=xml:$TEST_REPORTS_DIR/test_mobile_nnc.xml; fi # shellcheck source=test/mobile/nnc/test_aot_compile.sh if [ -f "$TORCH_BIN_DIR"/aot_model_compiler_test ]; then source test/mobile/nnc/test_aot_compile.sh; fi @@ -686,7 +691,6 @@ elif [[ "${BUILD_ENVIRONMENT}" == *-mobile-lightweight-dispatch* ]]; then elif [[ "${TEST_CONFIG}" = docs_test ]]; then test_docs_test elif [[ "${TEST_CONFIG}" == *functorch* ]]; then - install_functorch test_functorch else install_torchvision diff --git a/.jenkins/pytorch/win-test-helpers/build_pytorch.bat b/.jenkins/pytorch/win-test-helpers/build_pytorch.bat index 7edeca96ed8d..65ba7ef3235c 100644 --- a/.jenkins/pytorch/win-test-helpers/build_pytorch.bat +++ b/.jenkins/pytorch/win-test-helpers/build_pytorch.bat @@ -144,7 +144,7 @@ python setup.py install --cmake && sccache --show-stats && ( if "%BUILD_ENVIRONMENT%"=="" ( echo NOTE: To run `import torch`, please make sure to activate the conda environment by running `call %CONDA_PARENT_DIR%\Miniconda3\Scripts\activate.bat %CONDA_PARENT_DIR%\Miniconda3` in Command Prompt before running Git Bash. ) else ( - 7z a %TMP_DIR_WIN%\%IMAGE_COMMIT_TAG%.7z %CONDA_PARENT_DIR%\Miniconda3\Lib\site-packages\torch %CONDA_PARENT_DIR%\Miniconda3\Lib\site-packages\torchgen %CONDA_PARENT_DIR%\Miniconda3\Lib\site-packages\caffe2 && copy /Y "%TMP_DIR_WIN%\%IMAGE_COMMIT_TAG%.7z" "%PYTORCH_FINAL_PACKAGE_DIR%\" + 7z a %TMP_DIR_WIN%\%IMAGE_COMMIT_TAG%.7z %CONDA_PARENT_DIR%\Miniconda3\Lib\site-packages\torch %CONDA_PARENT_DIR%\Miniconda3\Lib\site-packages\torchgen %CONDA_PARENT_DIR%\Miniconda3\Lib\site-packages\caffe2 %CONDA_PARENT_DIR%\Miniconda3\Lib\site-packages\functorch && copy /Y "%TMP_DIR_WIN%\%IMAGE_COMMIT_TAG%.7z" "%PYTORCH_FINAL_PACKAGE_DIR%\" if errorlevel 1 exit /b if not errorlevel 0 exit /b diff --git a/.jenkins/pytorch/win-test-helpers/install_test_functorch.bat b/.jenkins/pytorch/win-test-helpers/install_test_functorch.bat index 7679bffbc70e..d06d46f3dd22 100644 --- a/.jenkins/pytorch/win-test-helpers/install_test_functorch.bat +++ b/.jenkins/pytorch/win-test-helpers/install_test_functorch.bat @@ -6,15 +6,6 @@ if not errorlevel 0 ( exit /b ) -pushd functorch -echo "Install functorch" -:: --no-deps because for some reason, on windows, `torch` isn't found in -:: `pip list` despite being installed. With just `python setup.py develop`, -:: setuptools explicitly checks for the existence of torch and can't find it. 
-python setup.py develop --no-deps -popd -if ERRORLEVEL 1 goto fail - echo "Installing test dependencies" pip install networkx if errorlevel 1 exit /b diff --git a/.lintrunner.toml b/.lintrunner.toml index b2fa676f8e13..33980c3cbc9f 100644 --- a/.lintrunner.toml +++ b/.lintrunner.toml @@ -318,6 +318,7 @@ exclude_patterns = [ 'aten/src/ATen/native/vulkan/api/vk_mem_alloc.h', 'test/cpp/jit/upgrader_models/*.ptl', 'test/cpp/jit/upgrader_models/*.ptl.ff', + 'cmake/External/nccl.patch', ] command = [ 'python3', @@ -347,6 +348,7 @@ exclude_patterns = [ 'test/cpp/jit/upgrader_models/*.ptl', 'test/cpp/jit/upgrader_models/*.ptl.ff', '.lintrunner.toml', + 'cmake/External/nccl.patch', ] command = [ 'python3', diff --git a/BUILD.bazel b/BUILD.bazel index dd417c413a6b..2c00e0d1dc56 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -133,6 +133,7 @@ filegroup( name = "aten_base_cpp", srcs = glob([ "aten/src/ATen/*.cpp", + "aten/src/ATen/functorch/*.cpp", "aten/src/ATen/detail/*.cpp", "aten/src/ATen/cpu/*.cpp", ]), diff --git a/CMakeLists.txt b/CMakeLists.txt index 379fa2fd7c7e..3800fe238cd6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -355,6 +355,8 @@ option(USE_PER_OPERATOR_HEADERS "Whether ATen should generate separate headers f cmake_dependent_option( BUILD_LAZY_TS_BACKEND "Build the lazy Torchscript backend, not compatible with mobile builds" ON "NOT INTERN_BUILD_MOBILE" OFF) +cmake_dependent_option( + BUILD_FUNCTORCH "Build Functorch" ON "BUILD_PYTHON" OFF) if(USE_CCACHE) @@ -572,6 +574,22 @@ if(ANDROID OR IOS OR DEFINED ENV{BUILD_PYTORCH_MOBILE_WITH_HOST_TOOLCHAIN}) message(WARNING "INTERN_BUILD_MOBILE is on, disabling BUILD_LAZY_TS_BACKEND") set(BUILD_LAZY_TS_BACKEND OFF) + # Set -ffunction-sections and -fdata-sections so that each method has its own + # text section. This allows the linker to remove unused sections when the flag + # -Wl,-gc-sections is provided at link time. + string(APPEND CMAKE_CXX_FLAGS " -ffunction-sections") + string(APPEND CMAKE_C_FLAGS " -ffunction-sections") + string(APPEND CMAKE_CXX_FLAGS " -fdata-sections") + string(APPEND CMAKE_C_FLAGS " -fdata-sections") + + # Please note that the use of the following flags is required when linking + # against libtorch_cpu.a for mobile builds. + # -Wl,--whole-archive -ltorch_cpu -Wl,--no-whole-archive + # + # This allows global constructors to be included and run. Global + # constructors are used for operator/kernel registration with the + # PyTorch Dispatcher. + if(DEFINED ENV{BUILD_PYTORCH_MOBILE_WITH_HOST_TOOLCHAIN}) # C10_MOBILE is derived from Android/iOS toolchain macros in # c10/macros/Macros.h, so it needs to be explicitly set here. @@ -590,6 +608,10 @@ endif() # INTERN_BUILD_ATEN_OPS is used to control whether to build ATen/TH operators. set(INTERN_BUILD_ATEN_OPS ON) +if(NOT DEFINED USE_BLAS) + set(USE_BLAS ON) +endif() + # Build libtorch mobile library, which contains ATen/TH ops and native support for # TorchScript model, but doesn't contain not-yet-unified caffe2 ops; if(INTERN_BUILD_MOBILE) @@ -602,13 +624,18 @@ if(INTERN_BUILD_MOBILE) set(INTERN_DISABLE_AUTOGRAD ON) endif() set(BUILD_PYTHON OFF) + set(BUILD_FUNCTORCH OFF) set(BUILD_CAFFE2_OPS OFF) set(USE_DISTRIBUTED OFF) set(NO_API ON) set(USE_FBGEMM OFF) set(USE_QNNPACK OFF) set(INTERN_DISABLE_ONNX ON) - set(INTERN_USE_EIGEN_BLAS ON) + if(USE_BLAS) + set(INTERN_USE_EIGEN_BLAS ON) + else() + set(INTERN_USE_EIGEN_BLAS OFF) + endif() # Disable developing mobile interpreter for actual mobile build. # Enable it elsewhere to capture build error.
set(INTERN_DISABLE_MOBILE_INTERP ON) @@ -697,6 +724,13 @@ set(BUILD_ONEDNN_GRAPH OFF) include(cmake/Dependencies.cmake) +# Moved this cmake set option down here because CMAKE_CUDA_COMPILER_VERSION is not available until now +option(USE_FLASH_ATTENTION "Whether to build the flash_attention kernel for scaled dot product attention" OFF) +if(USE_FLASH_ATTENTION) + ADD_DEFINITIONS(-DUSE_FLASH_ATTENTION) +ENDIF() + + if(USE_CUDA AND (CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 10.2) AND (CMAKE_HOST_SYSTEM_NAME MATCHES "Windows")) # CUDA < 10.2 doesn't support compiling and extracting header dependencies in # one call, so instead CMake calls nvcc twice with && in between. @@ -1144,3 +1178,7 @@ caffe2_print_configuration_summary() if(USE_DEPLOY) add_subdirectory(torch/csrc/deploy) endif() + +if(BUILD_FUNCTORCH) + add_subdirectory(functorch) +endif() diff --git a/MANIFEST.in b/MANIFEST.in index acf4c7291f43..403b90b702df 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -26,5 +26,6 @@ recursive-include benchmarks *.* recursive-include scripts *.* recursive-include mypy_plugins *.* recursive-include modules *.* +recursive-include functorch *.* prune */__pycache__ global-exclude *.o *.so *.dylib *.a .git *.pyc *.swp diff --git a/aten/src/ATen/BatchedTensorImpl.cpp b/aten/src/ATen/BatchedTensorImpl.cpp index d5ab588de53d..fdedfa7c6316 100644 --- a/aten/src/ATen/BatchedTensorImpl.cpp +++ b/aten/src/ATen/BatchedTensorImpl.cpp @@ -17,7 +17,7 @@ BatchedTensorImpl::BatchedTensorImpl(Tensor value, BatchDims bdims) { TORCH_INTERNAL_ASSERT(value_.defined()); set_storage_access_should_throw(); - set_sizes_strides_policy(SizesStridesPolicy::CustomStrides); + set_custom_sizes_strides(SizesStridesPolicy::CustomStrides); checkInvariants(); const auto public_dims = value_.dim() - bdims_.size(); diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt index 286d59f3e97d..3055e290094d 100644 --- a/aten/src/ATen/CMakeLists.txt +++ b/aten/src/ATen/CMakeLists.txt @@ -56,8 +56,8 @@ if(NOT BUILD_CAFFE2 AND NOT BUILD_LITE_INTERPRETER) EXCLUDE(ATen_CORE_TEST_SRCS "${ATen_CORE_TEST_SRCS}" ${ATen_CORE_EXCLUDED_TEST_SRCS}) endif() -file(GLOB base_h "*.h" "detail/*.h" "cpu/*.h" "cpu/vec/vec512/*.h" "cpu/vec/vec256/*.h" "cpu/vec/*.h" "quantized/*.h") -file(GLOB base_cpp "*.cpp" "detail/*.cpp" "cpu/*.cpp") +file(GLOB base_h "*.h" "detail/*.h" "cpu/*.h" "cpu/vec/vec512/*.h" "cpu/vec/vec256/*.h" "cpu/vec/*.h" "quantized/*.h" "functorch/*.h") +file(GLOB base_cpp "*.cpp" "detail/*.cpp" "cpu/*.cpp" "functorch/*.cpp") file(GLOB cuda_h "cuda/*.h" "cuda/detail/*.h" "cuda/*.cuh" "cuda/detail/*.cuh") file(GLOB cuda_cpp "cuda/*.cpp" "cuda/detail/*.cpp") file(GLOB cuda_nvrtc_stub_h "cuda/nvrtc_stub/*.h") @@ -130,15 +130,13 @@ file(GLOB native_cuda_h "native/cuda/*.h" "native/cuda/*.cuh") file(GLOB native_cuda_linalg_cpp "native/cuda/linalg/*.cpp") file(GLOB native_hip_h "native/hip/*.h" "native/hip/*.cuh") file(GLOB native_cudnn_cpp "native/cudnn/*.cpp") -file(GLOB native_nested_cuda_cu "native/nested/cuda/*.cu") -file(GLOB native_nested_cuda_cpp "native/nested/cuda/*.cpp") file(GLOB native_sparse_cuda_cu "native/sparse/cuda/*.cu") file(GLOB native_sparse_cuda_cpp "native/sparse/cuda/*.cpp") file(GLOB native_quantized_cuda_cu "native/quantized/cuda/*.cu") file(GLOB native_quantized_cuda_cpp "native/quantized/cuda/*.cpp") file(GLOB native_quantized_cudnn_cpp "native/quantized/cudnn/*.cpp") -file(GLOB native_transformers_cuda_cu "native/transformers/cuda/*.cu") -file(GLOB native_transformers_cuda_cpp "native/transformers/cuda/*.cpp")
+file(GLOB native_nested_cuda_cu "native/nested/cuda/*.cu") +file(GLOB native_nested_cuda_cpp "native/nested/cuda/*.cpp") file(GLOB native_hip_hip "native/hip/*.hip") file(GLOB native_hip_cpp "native/hip/*.cpp") @@ -151,11 +149,22 @@ file(GLOB native_sparse_hip_hip "native/sparse/hip/*.hip") file(GLOB native_sparse_hip_cpp "native/sparse/hip/*.cpp") file(GLOB native_quantized_hip_hip "native/quantized/hip/*.hip") file(GLOB native_quantized_hip_cpp "native/quantized/hip/*.cpp") +file(GLOB native_transformers_cuda_cu "native/transformers/cuda/*.cu") +file(GLOB native_transformers_cuda_cpp "native/transformers/cuda/*.cpp") file(GLOB native_transformers_hip_hip "native/transformers/hip/*.hip") file(GLOB native_transformers_hip_cpp "native/transformers/hip/*.cpp") file(GLOB native_quantized_cudnn_hip_cpp "native/quantized/cudnn/hip/*.cpp") file(GLOB native_utils_cpp "native/utils/*.cpp") +# flash_attention sources +file(GLOB flash_attention_cuda_cu "native/transformers/cuda/flash_attn/*.cu") +file(GLOB flash_attention_cuda_cpp "native/transformers/cuda/flash_attn/*.cpp") + +if(USE_FLASH_ATTENTION) + list(APPEND native_transformers_cuda_cu ${flash_attention_cuda_cu}) + list(APPEND native_transformers_cuda_cpp ${flash_attention_cuda_cpp}) +endif() + # XNNPACK file(GLOB native_xnnpack "native/xnnpack/*.cpp") @@ -415,6 +424,9 @@ if(NOT MSVC AND NOT EMSCRIPTEN AND NOT INTERN_BUILD_MOBILE) endif() if(USE_CUDA AND NOT USE_ROCM) + if(USE_FLASH_ATTENTION) + list(APPEND ATen_CUDA_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/cutlass/include) + endif() if($ENV{ATEN_STATIC_CUDA}) list(APPEND ATen_CUDA_DEPENDENCY_LIBS ${CUDA_LIBRARIES} diff --git a/aten/src/ATen/EmptyTensor.cpp b/aten/src/ATen/EmptyTensor.cpp index ff91aa0bd14d..daf0b6842365 100644 --- a/aten/src/ATen/EmptyTensor.cpp +++ b/aten/src/ATen/EmptyTensor.cpp @@ -106,6 +106,35 @@ size_t computeStorageNbytes( #endif } +// not including mobile-only macros in this function, +// since mobile shouldn't be using symints. 
+SymInt computeStorageNbytes( + SymIntArrayRef sizes, + SymIntArrayRef strides, + SymInt itemsize_bytes, + SymInt storage_offset + ) { + TORCH_CHECK( + sizes.size() == strides.size(), + "dimensionality of sizes (", + sizes.size(), + ") must match dimensionality of strides (", + strides.size(), + ")"); + + // size of the underlying storage is 1 bigger than the offset + // of the last element according to stride + SymInt size = 1; + for (const auto i : c10::irange(sizes.size())) { + if (sizes[i] == 0) { + return 0; + } + + size += strides[i] * (sizes[i] - 1); + } + return itemsize_bytes * (storage_offset + size); +} + TensorBase empty_generic( IntArrayRef size, c10::Allocator* allocator, @@ -140,20 +169,20 @@ return tensor; } -TensorBase empty_strided_generic( - IntArrayRef size, - IntArrayRef stride, +template <typename T> +TensorBase _empty_strided_generic( + T size, + T stride, c10::Allocator* allocator, c10::DispatchKeySet ks, ScalarType scalar_type) { at::detail::check_size_nonnegative(size); at::detail::raise_warning_for_complex_half(scalar_type); caffe2::TypeMeta dtype = scalarTypeToTypeMeta(scalar_type); - size_t size_bytes = computeStorageNbytes(size, stride, dtype.itemsize()); + auto size_bytes = computeStorageNbytes(size, stride, dtype.itemsize()); auto storage_impl = c10::make_intrusive<StorageImpl>( c10::StorageImpl::use_byte_size_t(), size_bytes, - allocator->allocate(size_bytes), allocator, /*resizeable=*/true); @@ -163,6 +192,24 @@ TensorBase empty_strided_generic( return tensor; } +TensorBase empty_strided_generic( + IntArrayRef size, + IntArrayRef stride, + c10::Allocator* allocator, + c10::DispatchKeySet ks, + ScalarType scalar_type) { + return _empty_strided_generic(size, stride, allocator, ks, scalar_type); +} + +TensorBase empty_strided_symint_generic( + SymIntArrayRef size, + SymIntArrayRef stride, + c10::Allocator* allocator, + c10::DispatchKeySet ks, + ScalarType scalar_type) { + return _empty_strided_generic(size, stride, allocator, ks, scalar_type); +} + TensorBase empty_cpu(IntArrayRef size, ScalarType dtype, bool pin_memory, c10::optional<c10::MemoryFormat> memory_format_opt) { auto allocator = GetCPUAllocatorMaybePinned(pin_memory); @@ -303,9 +350,7 @@ TensorBase empty_symint_meta( auto scalar_type = dtype_or_default(dtype_opt); auto *allocator = GetAllocator(kMeta); constexpr c10::DispatchKeySet meta_dks(c10::DispatchKey::Meta); - // TODO: do this. Note that naive implementation will choke on truly - // unknown sizes without on the fly reasoning - // at::detail::check_size_nonnegative(size); + at::detail::check_size_nonnegative(size); at::detail::raise_warning_for_complex_half(scalar_type); caffe2::TypeMeta dtype = scalarTypeToTypeMeta(scalar_type); SymInt size_bytes = dtype.itemsize(); @@ -343,7 +388,7 @@ TensorBase empty_symint_meta( TORCH_CHECK(0, "other memory format not implemented yet"); } - tensor.unsafeGetTensorImpl()->set_sym_sizes_and_strides(size, strides); + tensor.unsafeGetTensorImpl()->set_sizes_and_strides(size, strides); return tensor; } @@ -395,4 +440,40 @@ TensorBase empty_strided_meta( options.pinned_memory_opt()); } +TensorBase empty_strided_symint_meta(SymIntArrayRef size, SymIntArrayRef stride, + ScalarType dtype) { + auto *allocator = GetAllocator(kMeta); + constexpr c10::DispatchKeySet meta_dks(c10::DispatchKey::Meta); + return at::detail::empty_strided_symint_generic( + size, stride, allocator, meta_dks, dtype); +} + +TensorBase empty_strided_symint_meta( + SymIntArrayRef size, + SymIntArrayRef stride, + c10::optional<ScalarType> dtype_opt, + c10::optional<Layout> layout_opt, + c10::optional<Device> device_opt, + c10::optional<bool> pin_memory_opt) { + auto device = device_or_default(device_opt); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(device.type() == DeviceType::Meta); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(layout_or_default(layout_opt) == Layout::Strided); + + auto dtype = dtype_or_default(dtype_opt); + return at::detail::empty_strided_symint_meta(size, stride, dtype); +} + +TensorBase empty_strided_symint_meta( + SymIntArrayRef size, + SymIntArrayRef stride, + const TensorOptions &options) { + return at::detail::empty_strided_symint_meta( + size, + stride, + optTypeMetaToScalarType(options.dtype_opt()), + options.layout_opt(), + options.device_opt(), + options.pinned_memory_opt()); +} + }} // namespace at::detail diff --git a/aten/src/ATen/EmptyTensor.h b/aten/src/ATen/EmptyTensor.h index 06a33601a154..969eeb6dc5ee 100644 --- a/aten/src/ATen/EmptyTensor.h +++ b/aten/src/ATen/EmptyTensor.h @@ -4,7 +4,8 @@ namespace at { namespace detail { -inline void check_size_nonnegative(IntArrayRef size) { +template <typename ArrayRefType> +inline void check_size_nonnegative(ArrayRefType size) { for (auto x : size) { TORCH_CHECK( x >= 0, @@ -24,6 +25,11 @@ TORCH_API size_t computeStorageNbytes( IntArrayRef strides, size_t itemsize, size_t storage_offset = 0); +TORCH_API SymInt computeStorageNbytes( + SymIntArrayRef sizes, + SymIntArrayRef strides, + SymInt itemsize, + SymInt storage_offset = 0); TORCH_API TensorBase empty_generic( IntArrayRef size, @@ -39,6 +45,13 @@ TORCH_API TensorBase empty_strided_generic( c10::DispatchKeySet ks, ScalarType scalar_type); +TORCH_API TensorBase empty_strided_symint_generic( + SymIntArrayRef size, + SymIntArrayRef stride, + c10::Allocator* allocator, + c10::DispatchKeySet ks, + ScalarType scalar_type); + TORCH_API TensorBase empty_cpu( IntArrayRef size, ScalarType dtype, @@ -113,5 +126,23 @@ TORCH_API TensorBase empty_strided_meta( IntArrayRef stride, const TensorOptions& options); +TORCH_API TensorBase empty_strided_symint_meta( + SymIntArrayRef size, + SymIntArrayRef stride, + ScalarType dtype); + +TORCH_API TensorBase empty_strided_symint_meta( + SymIntArrayRef size, + SymIntArrayRef stride, + c10::optional<ScalarType> dtype_opt, + c10::optional<Layout> layout_opt, + c10::optional<Device> device_opt, + c10::optional<bool> pin_memory_opt); + +TORCH_API TensorBase empty_strided_symint_meta( + SymIntArrayRef size, + SymIntArrayRef stride, + const TensorOptions& options); + } //
namespace detail } // namespace at diff --git a/aten/src/ATen/FunctionalInverses.cpp b/aten/src/ATen/FunctionalInverses.cpp index 41c4e22a33de..6ae5f3b927c7 100644 --- a/aten/src/ATen/FunctionalInverses.cpp +++ b/aten/src/ATen/FunctionalInverses.cpp @@ -228,6 +228,11 @@ Tensor FunctionalInverses::transpose_copy_int_inverse(const Tensor& base, const } } +Tensor FunctionalInverses::_nested_view_from_buffer_copy_inverse(const Tensor& base, const Tensor& mutated_view, bool reapply_views, const Tensor& nested_size_tensor, const Tensor& nested_stride_tensor, IntArrayRef offsets) { + TORCH_INTERNAL_ASSERT(false, "Attempted to call _nested_view_from_buffer() during the functionalization pass. For now, nested tensors aren't supported during functionalization"); + return Tensor(); +} + Tensor FunctionalInverses::unsqueeze_copy_inverse(const Tensor& base, const Tensor& mutated_view, bool reapply_views, int64_t dim) { if (reapply_views) { return at::squeeze(mutated_view, dim); diff --git a/aten/src/ATen/FunctionalStorageImpl.cpp b/aten/src/ATen/FunctionalStorageImpl.cpp index 7f136759ef6a..e50ffbdcf511 100644 --- a/aten/src/ATen/FunctionalStorageImpl.cpp +++ b/aten/src/ATen/FunctionalStorageImpl.cpp @@ -1,5 +1,6 @@ #include +#include #include #include #include @@ -90,10 +91,24 @@ bool Alias::apply_updates() { return any_updates; } +c10::SymInt get_nbytes(const Tensor& value) { + if (value.unsafeGetTensorImpl()->has_symbolic_sizes_strides()) { + // Today, the two implementations of SymInt are in Python (proxy tensor), + // and lazy tensor (LTC/XLA). + // LTC hasn't implemented SymInt support yet though (torch::lazy::SymIntNodeImpl). + // Once it does, we should remove this check. + if (value.key_set().has(c10::DispatchKey::Python)) { + return value.storage().sym_nbytes(); + } + } + // XLA storage objects also do not properly track nbytes. + return at::detail::computeStorageNbytes(value.sizes(), value.strides(), value.dtype().itemsize(), value.storage_offset()); +} + FunctionalStorageImpl::FunctionalStorageImpl(const Tensor& value) : c10::StorageImpl( c10::StorageImpl::use_byte_size_t(), - value.numel() * value.dtype().itemsize(), + get_nbytes(value), DataPtr{nullptr, value.device()}, GetAllocator(kMeta), /*resizeable=*/true diff --git a/aten/src/ATen/FunctionalTensorWrapper.cpp b/aten/src/ATen/FunctionalTensorWrapper.cpp index 0692982ec467..2c60d3e77ba4 100644 --- a/aten/src/ATen/FunctionalTensorWrapper.cpp +++ b/aten/src/ATen/FunctionalTensorWrapper.cpp @@ -49,6 +49,9 @@ void FunctionalTensorWrapper::set_constructor_metadata() { // Instead, it's sufficient to remove the `Dense` dispatch key, // which prevents us from accidentally trying to directly run a CPU/CUDA kernel. 
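As a sanity check on the storage-size rule above (the SymInt computeStorageNbytes, which get_nbytes also falls back to), here is the same arithmetic in plain int64_t form; an illustrative sketch, not part of the patch:

#include <cassert>
#include <cstdint>
#include <vector>

// Storage must cover storage_offset plus one element past the largest
// reachable index, i.e. 1 + sum_i strides[i] * (sizes[i] - 1) elements.
int64_t storage_nbytes(const std::vector<int64_t>& sizes,
                       const std::vector<int64_t>& strides,
                       int64_t itemsize, int64_t storage_offset) {
  int64_t size = 1;
  for (size_t i = 0; i < sizes.size(); ++i) {
    if (sizes[i] == 0) return 0;  // any zero-sized dim means no storage at all
    size += strides[i] * (sizes[i] - 1);
  }
  return itemsize * (storage_offset + size);
}

int main() {
  // float32 tensor with sizes {2, 3}, strides {4, 1}, storage_offset 2:
  // the last element lives at index 2 + 1*4 + 2*1 = 8, so 9 elements * 4 bytes.
  assert(storage_nbytes({2, 3}, {4, 1}, 4, 2) == 36);
}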
key_set_ = key_set_.remove(c10::DispatchKey::Dense); + // We override a bunch of _custom(), so make sure they get called + // TODO: metadata copying may not actually be necessary then + set_custom_sizes_strides(SizesStridesPolicy::CustomSizes); } FunctionalTensorWrapper::FunctionalTensorWrapper(const Tensor& value) @@ -343,12 +346,12 @@ int64_t FunctionalTensorWrapper::numel_custom() const { bool FunctionalTensorWrapper::is_contiguous_custom(at::MemoryFormat memory_format) const { return value_.unsafeGetTensorImpl()->is_contiguous(); } -c10::SymIntArrayRef FunctionalTensorWrapper::sym_sizes() const { - return value_.unsafeGetTensorImpl()->sym_sizes(); -} c10::SymIntArrayRef FunctionalTensorWrapper::sym_sizes_custom() const { return value_.unsafeGetTensorImpl()->sym_sizes(); } +c10::SymIntArrayRef FunctionalTensorWrapper::sym_strides_custom() const { + return value_.unsafeGetTensorImpl()->sym_strides(); +} namespace functionalization { namespace impl { diff --git a/aten/src/ATen/FunctionalTensorWrapper.h b/aten/src/ATen/FunctionalTensorWrapper.h index c5c0339fc1bf..cf389715795a 100644 --- a/aten/src/ATen/FunctionalTensorWrapper.h +++ b/aten/src/ATen/FunctionalTensorWrapper.h @@ -134,15 +134,15 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl { ~FunctionalTensorWrapper() override = default; // FunctionalTensorWrapper overrides all custom size/stride functions, - // so that if the inner tensor has a custo implementation + // so that if the inner tensor has a custom implementation // we make sure to call that implementation. at::IntArrayRef sizes_custom() const override; at::IntArrayRef strides_custom() const override; int64_t dim_custom() const override; int64_t numel_custom() const override; bool is_contiguous_custom(at::MemoryFormat memory_format) const override; - c10::SymIntArrayRef sym_sizes() const override; c10::SymIntArrayRef sym_sizes_custom() const override; + c10::SymIntArrayRef sym_strides_custom() const override; private: const char* tensorimpl_type_name() const override; diff --git a/aten/src/ATen/InferSize.h b/aten/src/ATen/InferSize.h index e0bedb751bf2..594b87373a20 100644 --- a/aten/src/ATen/InferSize.h +++ b/aten/src/ATen/InferSize.h @@ -2,6 +2,8 @@ #include #include +#include +#include #include #include #include @@ -14,9 +16,13 @@ namespace at { // templated to handle std::vector<int64_t> and DimVector use cases, see // below // -template <typename ResultVec> -inline void infer_size_impl(IntArrayRef shape, int64_t numel, ResultVec& res) { - int64_t newsize = 1; +template <typename InputArrayRef, typename NumelType, typename ResultVec> +inline void infer_size_impl( + InputArrayRef shape, + NumelType numel, + ResultVec& res) { + NumelType newsize = 1; + // N.B. this is an index, not a sym dim!
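An aside before the loop resumes below: the rule infer_size_impl encodes, restated as a standalone sketch in plain int64_t (illustrative only):

#include <cstdint>
#include <optional>
#include <stdexcept>
#include <vector>

// Replace at most one -1 entry so the shape's product matches numel.
std::vector<int64_t> infer_size_sketch(std::vector<int64_t> shape, int64_t numel) {
  int64_t newsize = 1;
  std::optional<size_t> infer_dim;
  for (size_t d = 0; d < shape.size(); ++d) {
    if (shape[d] == -1) {
      if (infer_dim) throw std::runtime_error("only one dimension can be inferred");
      infer_dim = d;
    } else {
      newsize *= shape[d];
    }
  }
  if (infer_dim && newsize > 0 && numel % newsize == 0) {
    shape[*infer_dim] = numel / newsize;  // e.g. {2, -1} with numel 6 -> {2, 3}
    return shape;
  }
  if (!infer_dim && newsize == numel) return shape;
  throw std::runtime_error("shape is invalid for the given number of elements");
}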
auto infer_dim = c10::optional<int64_t>(); for (int64_t dim = 0, ndim = shape.size(); dim != ndim; dim++) { if (shape[dim] == -1) { @@ -69,4 +75,13 @@ inline at::DimVector infer_size_dv(IntArrayRef shape, int64_t numel) { return res; } +inline at::SymDimVector infer_size_dv( + c10::SymIntArrayRef shape, + c10::SymInt numel) { + auto res = at::SymDimVector(shape); + infer_size_impl( + shape, numel, res); + return res; +} + } // namespace at diff --git a/aten/src/ATen/NestedTensorImpl.cpp b/aten/src/ATen/NestedTensorImpl.cpp index 1d3efc6f06bf..fb89f46d86b7 100644 --- a/aten/src/ATen/NestedTensorImpl.cpp +++ b/aten/src/ATen/NestedTensorImpl.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include #include @@ -25,6 +26,46 @@ inline void validate_nested_tensor_metadata( (size_dim == 0 && (int64_t)offsets.empty()) || (size_dim == 2 && nested_sizes.size(0) == (int64_t)offsets.size())); } + +/** + * Generates a nested key_set from a non-nested tensor. + * + * When creating a nested tensor from a non-nested tensor, + * we want to maintain the same keyset as the buffer but + * swap non-nested keys for nested ones + * + * @return Appropriate key set for nested tensor + */ +inline c10::DispatchKeySet generate_nested_key_set_from_buffer( + const at::Tensor& buffer) { + auto nested_key_set = buffer.key_set(); + const bool has_autograd = nested_key_set.has_any(c10::autograd_dispatch_keyset); + // Remove non_nested tensor specific keys + nested_key_set = nested_key_set - + c10::DispatchKeySet{c10::DispatchKey::Dense, c10::DispatchKey::Autograd}; + + // Add nested tensor specific keys + nested_key_set = + nested_key_set | c10::DispatchKeySet{c10::DispatchKey::NestedTensor}; + nested_key_set = + has_autograd ? nested_key_set | c10::autograd_nested : nested_key_set; + return nested_key_set; +} + +/** + * Generates the correct view keyset. + * + * When creating a nested tensor view of base, + * the appropriate keyset will be dependent on the nested + * status of the base + * + * @return Appropriate key set for nested tensor + */ +c10::DispatchKeySet get_view_key_set(const at::Tensor& base) { + return base.is_nested() ? base.key_set() + : generate_nested_key_set_from_buffer(base); +} + } // namespace namespace at { namespace native { @@ -119,19 +160,6 @@ inline std::vector<int64_t> construct_offsets(const at::Tensor& sizes) { return offsets; } -// [Note: Nested Tensor Autograd] The Nested Tensor key is a functionality -// key and therefore getAutogradRelatedKeySetFromBackend will return the
For this specific impl we make sure to register the -// correct Autograd key which is AutogradNestedTensor -c10::DispatchKeySet generate_nested_key_set(at::Tensor buffer) { - c10::DispatchKeySet key_set = - c10::DispatchKeySet(DispatchKey::NestedTensor) | c10::DispatchKeySet{buffer.key_set().highestBackendKey()}; - - // Add AutogradNestedTensor specific keys - key_set = key_set | inplace_or_view_ks | autograd_nested; - return key_set; -} - NestedTensorImpl::NestedTensorImpl( Storage storage, c10::DispatchKeySet key_set, @@ -154,7 +182,7 @@ NestedTensorImpl::NestedTensorImpl( storage_device); validate_nested_tensor_metadata(nested_size_tensor_, nested_stride_tensor_, offsets_); refresh_dim(); - set_sizes_strides_policy(c10::TensorImpl::SizesStridesPolicy::CustomSizes); + set_custom_sizes_strides(c10::TensorImpl::SizesStridesPolicy::CustomSizes); } NestedTensorImpl::NestedTensorImpl( @@ -164,7 +192,7 @@ NestedTensorImpl::NestedTensorImpl( std::vector&& offsets) : NestedTensorImpl( buffer.storage(), - generate_nested_key_set(buffer), + generate_nested_key_set_from_buffer(buffer), buffer.dtype(), nested_size_tensor, nested_stride_tensor, @@ -195,15 +223,14 @@ NestedTensorImpl::NestedTensorImpl( at::Tensor nested_size_tensor, at::Tensor nested_stride_tensor, std::vector&& offsets) - : TensorImpl(impl_type, Storage(base_tensor.storage()), base_tensor.key_set(), base_tensor.dtype()), + : TensorImpl(impl_type, Storage(base_tensor.storage()), get_view_key_set(base_tensor), base_tensor.dtype()), nested_size_tensor_(std::move(nested_size_tensor)), nested_stride_tensor_(std::move(nested_stride_tensor)), offsets_(std::move(offsets)), opt_sizes_(construct_opt_sizes(nested_size_tensor_)) { - TORCH_INTERNAL_ASSERT(base_tensor.is_nested()); validate_nested_tensor_metadata(nested_size_tensor_, nested_stride_tensor_, offsets_); refresh_dim(); - set_sizes_strides_policy(c10::TensorImpl::SizesStridesPolicy::CustomSizes); + set_custom_sizes_strides(c10::TensorImpl::SizesStridesPolicy::CustomSizes); } void NestedTensorImpl::refresh_dim() { @@ -256,9 +283,6 @@ c10::SymIntArrayRef NestedTensorImpl::sym_sizes_custom() const { TORCH_CHECK(false, "Internal error: NestedTensorImpl doesn't support sizes. Please file an issue on https://github.com/pytorch/nestedtensor"); } -c10::SymIntArrayRef NestedTensorImpl::sym_sizes() const { - return sym_sizes_custom(); -} c10::SymIntArrayRef NestedTensorImpl::sym_strides_custom() const { TORCH_CHECK(false, "Internal error: NestedTensorImpl doesn't support strides. 
Please file an issue on https://github.com/pytorch/nestedtensor"); } diff --git a/aten/src/ATen/NestedTensorImpl.h b/aten/src/ATen/NestedTensorImpl.h index f1fb8273c290..278df3c0d203 100644 --- a/aten/src/ATen/NestedTensorImpl.h +++ b/aten/src/ATen/NestedTensorImpl.h @@ -109,7 +109,6 @@ struct TORCH_API NestedTensorImpl : public c10::TensorImpl { } IntArrayRef sizes_custom() const override; c10::SymIntArrayRef sym_sizes_custom() const override; - c10::SymIntArrayRef sym_sizes() const override; IntArrayRef strides_custom() const override; c10::SymIntArrayRef sym_strides_custom() const override; @@ -168,7 +167,7 @@ struct TORCH_API NestedTensorImpl : public c10::TensorImpl { * is generated and redispatched to a non-nested kernel this function * generates the key set used by that buffer tensor * - * @return A newly constructed view tensor + * @return Appropriate key set for non-nested tensor */ inline c10::DispatchKeySet generate_buffer_key_set() const { auto buffer_key_set = this->key_set(); @@ -185,6 +184,7 @@ struct TORCH_API NestedTensorImpl : public c10::TensorImpl { buffer_key_set = Autograd ? c10::DispatchKeySet{c10::DispatchKey::Autograd} | buffer_key_set : buffer_key_set; + return buffer_key_set; } }; diff --git a/aten/src/ATen/OpaqueTensorImpl.h b/aten/src/ATen/OpaqueTensorImpl.h index 1888c65725ee..e6c6413815bb 100644 --- a/aten/src/ATen/OpaqueTensorImpl.h +++ b/aten/src/ATen/OpaqueTensorImpl.h @@ -30,7 +30,7 @@ struct TORCH_API OpaqueTensorImpl : public TensorImpl { : TensorImpl(key_set, data_type, device), opaque_handle_(std::move(opaque_handle)) { set_storage_access_should_throw(); - set_sizes_strides_policy(SizesStridesPolicy::CustomStrides); + set_custom_sizes_strides(SizesStridesPolicy::CustomStrides); sizes_and_strides_.set_sizes(sizes); refresh_numel(); is_non_overlapping_and_dense_ = is_non_overlapping_and_dense; diff --git a/aten/src/ATen/SparseCsrTensorImpl.cpp b/aten/src/ATen/SparseCsrTensorImpl.cpp index 69fc013211f9..4adc602b14ce 100644 --- a/aten/src/ATen/SparseCsrTensorImpl.cpp +++ b/aten/src/ATen/SparseCsrTensorImpl.cpp @@ -68,7 +68,7 @@ SparseCsrTensorImpl::SparseCsrTensorImpl( "to https://github.com/pytorch/pytorch/issues."); set_storage_access_should_throw(); is_non_overlapping_and_dense_ = false; - set_sizes_strides_policy(SizesStridesPolicy::CustomStrides); + set_custom_sizes_strides(SizesStridesPolicy::CustomStrides); // TODO: If this check ever shows up as a bottleneck, which is unlikely given that // comparing devices only involves comparing the type and index (two integers), we // can move this to a DEBUG only assert. 
Until then this confirms and maintains a @@ -104,10 +104,51 @@ void SparseCsrTensorImpl::resize_(int64_t nnz, IntArrayRef size) { sizes_and_strides_.set_sizes(size); } +void SparseCsrTensorImpl::resize_and_clear_(int64_t sparse_dim, IntArrayRef size) { + TORCH_CHECK( + !has_symbolic_sizes_strides_, + "resize_as_sparse_csr_tensor_ called on tensor with symbolic shape"); + TORCH_CHECK(sparse_dim >= 2, "resize_and_clear_ sparse dimensionality must be at least 2, got ", sparse_dim); + TORCH_CHECK(static_cast(size.size()) >= sparse_dim, "resize_and_clear_ size length must be at least sparse dimensionality (=", + sparse_dim, "), got ", size.size()); + auto batch_dim = sparse_dim - 2; + auto batchsize = size.slice(0, batch_dim); + auto densesize = size.slice(batch_dim + 2, size.size() - batch_dim - 2); + + auto values_size = DimVector(batchsize); + values_size.push_back(0); // nse + values_size.append(densesize.begin(), densesize.end()); + + auto col_indices_size = DimVector(batchsize); + col_indices_size.push_back(0); // nse + + auto n_compressed_indices = AT_DISPATCH_ROW_SPARSE_COMPRESSED_LAYOUTS(layout_, "resize_and_clear_", + [&] () -> int64_t { return size[batch_dim]; }, + [&] () -> int64_t { return size[batch_dim + 1]; } + ); + AT_DISPATCH_PLAIN_SPARSE_COMPRESSED_LAYOUTS(layout_, + "resize_and_clear_", + [] () {}, + [&] () { + auto blocksize = this->values_.sizes().slice(this->batch_dim() + 1, 2); + values_size.append(blocksize.begin(), blocksize.end()); + n_compressed_indices /= blocksize[(the_layout == kSparseBsr ? 0 : 1)]; + }); + auto crow_indices_size = DimVector(batchsize); + crow_indices_size.push_back(n_compressed_indices + 1); + + crow_indices_.resize_(crow_indices_size); + crow_indices_.zero_(); + col_indices_.resize_(col_indices_size); + values_.resize_(values_size); + sizes_and_strides_.set_sizes(size); + refresh_numel(); +} + void SparseCsrTensorImpl::resize_as_sparse_csr_tensor_(const Tensor& src) { TORCH_CHECK( !has_symbolic_sizes_strides_, - "resize_as_sparse_csr_tensor_ called on tensor with symbolic shape") + "resize_as_sparse_csr_tensor_ called on tensor with symbolic shape"); set_layout(src.layout()); crow_indices_ = at::empty_like( src.crow_indices(), @@ -132,7 +173,7 @@ void SparseCsrTensorImpl::set_member_tensors( IntArrayRef size) { TORCH_CHECK( !has_symbolic_sizes_strides_, - "set_member_tensors called on tensor with symbolic shape") + "set_member_tensors called on tensor with symbolic shape"); // CSR Type Invariants TORCH_CHECK( @@ -172,5 +213,8 @@ void SparseCsrTensorImpl::set_stride(int64_t dim, int64_t new_stride) { void SparseCsrTensorImpl::set_storage_offset(int64_t storage_offset) { TORCH_CHECK(false, "Sparse ", at::sparse_csr::layoutToString(layout_, /*upper=*/true), " tensors do not have set_storage_offset."); } +bool SparseCsrTensorImpl::is_contiguous_custom(MemoryFormat) const { + TORCH_CHECK(false, "Sparse ", at::sparse_csr::layoutToString(layout_, /*upper=*/true), " tensors do not have is_contiguous"); +} } // namespace at diff --git a/aten/src/ATen/SparseCsrTensorImpl.h b/aten/src/ATen/SparseCsrTensorImpl.h index 1f84fb422fde..9d361be15674 100644 --- a/aten/src/ATen/SparseCsrTensorImpl.h +++ b/aten/src/ATen/SparseCsrTensorImpl.h @@ -37,6 +37,7 @@ struct TORCH_API SparseCsrTensorImpl : public TensorImpl { const caffe2::TypeMeta); void resize_(int64_t nnz, IntArrayRef size); + void resize_and_clear_(int64_t sparse_dim, IntArrayRef size); void resize_as_sparse_csr_tensor_(const Tensor& src); void set_member_tensors( const Tensor& crow_indices, @@ -77,6 
+78,7 @@ struct TORCH_API SparseCsrTensorImpl : public TensorImpl { protected: IntArrayRef strides_custom() const override; SymIntArrayRef sym_strides_custom() const override; + bool is_contiguous_custom(MemoryFormat) const override; public: void set_size(int64_t dim, int64_t new_size) override; diff --git a/aten/src/ATen/SparseTensorImpl.cpp b/aten/src/ATen/SparseTensorImpl.cpp index 99dcec4d6162..197ae2143896 100644 --- a/aten/src/ATen/SparseTensorImpl.cpp +++ b/aten/src/ATen/SparseTensorImpl.cpp @@ -46,7 +46,7 @@ SparseTensorImpl::SparseTensorImpl(at::DispatchKeySet key_set, const caffe2::Typ is_non_overlapping_and_dense_ = false; set_storage_access_should_throw(); - set_sizes_strides_policy(SizesStridesPolicy::CustomStrides); + set_custom_sizes_strides(SizesStridesPolicy::CustomStrides); } // Destructor doesn't call release_resources because it's diff --git a/aten/src/ATen/TensorUtils.cpp b/aten/src/ATen/TensorUtils.cpp index 7fbddd7a3482..e014b650f989 100644 --- a/aten/src/ATen/TensorUtils.cpp +++ b/aten/src/ATen/TensorUtils.cpp @@ -310,12 +310,12 @@ std::vector<int64_t> defaultStrides(IntArrayRef sizes) { // templatized for DimVector and IntArrayRef use cases, // see overloads of computeStride() below. // -template <typename ResultVec, typename NewShapeVec> +template <typename ResultVec, typename NewShapeVec, typename Numel> inline c10::optional<ResultVec> computeStride_impl( - IntArrayRef oldshape, - IntArrayRef oldstride, + const NewShapeVec& oldshape, + const NewShapeVec& oldstride, const NewShapeVec& newshape, - ResultVec toResult(const IntArrayRef&) + ResultVec toResult(const NewShapeVec&) ) { if (oldshape.empty()) { return ResultVec(newshape.size(), 1); } @@ -326,7 +326,7 @@ inline c10::optional<ResultVec> computeStride_impl( // we use the stride as if it were computed via resize. // This could perhaps be combined with the below code, but the complexity // didn't seem worth it. - const int64_t numel = c10::multiply_integers(oldshape); + const Numel numel = c10::multiply_integers(oldshape); if (numel == 0 && oldshape.equals(newshape)) { return toResult(oldstride); } @@ -338,7 +338,7 @@ inline c10::optional<ResultVec> computeStride_impl( newstride[view_d] = 1; } else { newstride[view_d] = - std::max<int64_t>(newshape[view_d+1], 1) * newstride[view_d+1]; + std::max(newshape[view_d+1], Numel(1)) * newstride[view_d+1]; } } return newstride; @@ -346,10 +346,10 @@ inline c10::optional<ResultVec> computeStride_impl( int64_t view_d = (int64_t)newshape.size() - 1; // stride for each subspace in the chunk - int64_t chunk_base_stride = oldstride.back(); + Numel chunk_base_stride = oldstride.back(); // numel in current chunk - int64_t tensor_numel = 1; - int64_t view_numel = 1; + Numel tensor_numel = 1; + Numel view_numel = 1; for (int64_t tensor_d = oldshape.size() - 1; tensor_d >= 0; tensor_d--) { tensor_numel *= oldshape[tensor_d]; // if end of tensor size chunk, check view @@ -383,7 +383,15 @@ c10::optional<std::vector<int64_t>> computeStride( IntArrayRef oldstride, IntArrayRef newshape) { auto toResult = [](const IntArrayRef& a) { return a.vec(); }; - return computeStride_impl<std::vector<int64_t>, IntArrayRef>(oldshape, oldstride, newshape, toResult); + return computeStride_impl<std::vector<int64_t>, IntArrayRef, int64_t>(oldshape, oldstride, newshape, toResult); +} + +c10::optional<SymDimVector> computeStride( + c10::SymIntArrayRef oldshape, + c10::SymIntArrayRef oldstride, + c10::SymIntArrayRef newshape) { + auto toResult = [](const SymIntArrayRef& a) { return SymDimVector(a); }; + return computeStride_impl<SymDimVector, c10::SymIntArrayRef, c10::SymInt>(oldshape, oldstride, newshape, toResult); } c10::optional<DimVector> computeStride( @@ -391,7 +399,7 @@ IntArrayRef oldstride, const DimVector& newshape) { auto toResult = [](const IntArrayRef& a) { return DimVector(a); }; - return computeStride_impl<DimVector, IntArrayRef>(oldshape, oldstride, newshape, toResult); + return computeStride_impl<DimVector, IntArrayRef, int64_t>(oldshape, oldstride, newshape, toResult); } } // namespace detail diff --git a/aten/src/ATen/TensorUtils.h b/aten/src/ATen/TensorUtils.h index 4bfe87c9de44..2a70e64da066 100644 --- a/aten/src/ATen/TensorUtils.h +++ b/aten/src/ATen/TensorUtils.h @@ -157,6 +157,11 @@ TORCH_API c10::optional<std::vector<int64_t>> computeStride( IntArrayRef oldstride, IntArrayRef newshape); +TORCH_API c10::optional<SymDimVector> computeStride( + c10::SymIntArrayRef oldshape, + c10::SymIntArrayRef oldstride, + c10::SymIntArrayRef newshape); + TORCH_API c10::optional<DimVector> computeStride( IntArrayRef oldshape, IntArrayRef oldstride, diff --git a/aten/src/ATen/ThreadLocalState.cpp b/aten/src/ATen/ThreadLocalState.cpp index fb589beaba89..422c1dcc6f0f 100644 --- a/aten/src/ATen/ThreadLocalState.cpp +++ b/aten/src/ATen/ThreadLocalState.cpp @@ -14,6 +14,7 @@ ThreadLocalState::ThreadLocalState() debug_info_(c10::ThreadLocalDebugInfo::current()), functorch_tls_(functorch::getCopyOfFuncTorchTLS()), autograd_tls_(c10::AutogradState::get_tls_state()), + python_dispatcher_state_(c10::impl::PythonDispatcherTLS::get_state()), python_torch_function_state_(at::impl::PythonTorchFunctionTLS::get_state()) { rf_tls_ = at::get_record_function_tls_(); @@ -41,6 +42,8 @@ void ThreadLocalState::setThreadLocalState( at::SavedTensorDefaultHooks::set_stack(state.saved_tensors_default_hooks_); + c10::impl::PythonDispatcherTLS::set_state(state.python_dispatcher_state_); + c10::ThreadLocalDebugInfo::_forceCurrentDebugInfo(state.debug_info_); c10::impl::_force_tls_local_dispatch_key_set(state.dispatch_key_); diff --git a/aten/src/ATen/ThreadLocalState.h b/aten/src/ATen/ThreadLocalState.h index a0067fb8aaeb..529c98b99723 100644 ---
a/aten/src/ATen/ThreadLocalState.h +++ b/aten/src/ATen/ThreadLocalState.h @@ -10,6 +10,7 @@ #include #include #include +#include #include namespace at { @@ -57,6 +58,9 @@ class TORCH_API ThreadLocalState { // TLS for enable_torch_dispatch_mode std::shared_ptr torch_dispatch_mode_state_; + // TLS for enable_python_dispatcher + c10::impl::PyInterpreter* python_dispatcher_state_; + // TLS for __torch_function__ (mode and disable_torch_function) at::impl::PythonTorchFunctionTLS python_torch_function_state_; diff --git a/aten/src/ATen/autocast_mode.cpp b/aten/src/ATen/autocast_mode.cpp index 396b9746754c..95f9029c8dfb 100644 --- a/aten/src/ATen/autocast_mode.cpp +++ b/aten/src/ATen/autocast_mode.cpp @@ -595,12 +595,6 @@ TORCH_LIBRARY_IMPL(aten, AutocastCPU, m) { KERNEL_CPU(ADD_NS(linalg_tensorsolve), "linalg_tensorsolve", Tensor(const Tensor &, const Tensor &, at::OptionalIntArrayRef), fp32) KERNEL_CPU(ADD_NS(fake_quantize_per_tensor_affine), "fake_quantize_per_tensor_affine", Tensor (const Tensor &, double, int64_t, int64_t, int64_t), fp32) - m.impl(TORCH_SELECTIVE_NAME("aten::eig"), - TORCH_FN((&WrapFunction (const Tensor &, bool), - std::tuple (const Tensor &, bool), - &ADD_NS(eig)>::type::call))); - m.impl(TORCH_SELECTIVE_NAME("aten::geqrf"), TORCH_FN((&WrapFunction (const Tensor &), diff --git a/aten/src/ATen/core/PythonFallbackKernel.cpp b/aten/src/ATen/core/PythonFallbackKernel.cpp index 06588a0a989d..381c8a45aca6 100644 --- a/aten/src/ATen/core/PythonFallbackKernel.cpp +++ b/aten/src/ATen/core/PythonFallbackKernel.cpp @@ -1,4 +1,5 @@ #include +#include #include #include @@ -87,6 +88,12 @@ void pythonFallback(const c10::OperatorHandle& op, torch::jit::Stack* stack) { TORCH_INTERNAL_ASSERT(0, "Hit Python dispatch key but no arguments had PyInterpreter (no tensor args?)"); } +void pythonDispatcherFallback(const c10::OperatorHandle& op, c10::DispatchKeySet dispatch_keys, torch::jit::Stack* stack) { + auto* state = c10::impl::PythonDispatcherTLS::get_state(); + TORCH_INTERNAL_ASSERT(state, "Hit PythonDispatcher dispatch key but PythonDispatcherTLS was not set"); + (*state)->python_dispatcher(op, dispatch_keys.remove(c10::DispatchKey::PythonDispatcher), stack); +} + void pythonTLSSnapshotFallback(const c10::OperatorHandle &op, c10::DispatchKeySet dispatch_keys, torch::jit::Stack* stack) { // It is ok for the tls to be already set here. // It means that there are multiple calls into the dispatcher not originating from python code. @@ -134,6 +141,10 @@ TORCH_LIBRARY_IMPL(_, Python, m) { m.fallback(torch::CppFunction::makeFromBoxedFunction<&pythonFallback>()); } +TORCH_LIBRARY_IMPL(_, PythonDispatcher, m) { + m.fallback(torch::CppFunction::makeFromBoxedFunction<&pythonDispatcherFallback>()); +} + TORCH_LIBRARY_IMPL(_, PythonTLSSnapshot, m) { m.fallback(torch::CppFunction::makeFromBoxedFunction<&pythonTLSSnapshotFallback>()); } diff --git a/aten/src/ATen/core/TensorBase.h b/aten/src/ATen/core/TensorBase.h index e6dd73658efc..3f35d3a71de4 100644 --- a/aten/src/ATen/core/TensorBase.h +++ b/aten/src/ATen/core/TensorBase.h @@ -302,6 +302,10 @@ class TORCH_API TensorBase { return impl_->sym_numel(); } + c10::SymInt sym_storage_offset() const { + return impl_->sym_storage_offset(); + } + // Length of one array element in bytes. This is the traditional // Numpy naming. 
size_t itemsize() const { diff --git a/aten/src/ATen/core/dispatch/Dispatcher.h b/aten/src/ATen/core/dispatch/Dispatcher.h index bc40bc5b62e0..1ea677b54ef5 100644 --- a/aten/src/ATen/core/dispatch/Dispatcher.h +++ b/aten/src/ATen/core/dispatch/Dispatcher.h @@ -168,6 +168,12 @@ class TORCH_API Dispatcher final { // See Note [Plumbing Keys Through The Dispatcher] void redispatchBoxed(const OperatorHandle& op, DispatchKeySet dispatchKeySet, Stack* stack) const; + bool hasBackendFallbackForDispatchKey(DispatchKey dk) { + auto dispatch_ix = getDispatchTableIndexForDispatchKey(dk); + if (dispatch_ix < 0) return false; + return backendFallbackKernels_[dispatch_ix].kernel.isValid(); + } + // ------------------------------------------------------------------------ // @@ -333,6 +339,10 @@ class TORCH_API OperatorHandle { return operatorDef_->op.hasKernelForDispatchKey(k); } + bool hasKernelForAnyDispatchKey(DispatchKeySet k) const { + return operatorDef_->op.hasKernelForAnyDispatchKey(k); + } + bool hasComputedKernelForDispatchKey(DispatchKey k) const { return operatorDef_->op.hasComputedKernelForDispatchKey(k); } @@ -388,6 +398,11 @@ class TORCH_API OperatorHandle { c10::Dispatcher::singleton().redispatchBoxed(*this, ks, stack); } + template + PyObject* getPythonOp(c10::impl::PyInterpreter* self_interpreter, F slow_accessor) const { + return operatorDef_->op.getPythonOp(self_interpreter, slow_accessor); + } + private: explicit OperatorHandle(std::list::iterator operatorIterator) : operatorDef_(&*operatorIterator), operatorIterator_(operatorIterator) {} @@ -635,11 +650,18 @@ inline void Dispatcher::callBoxedForDispatchKey(const OperatorHandle& op, Dispat // We still compute this as we're obligated to pass it on to the internal // kernel, if it is a boxed fallback auto dispatchKeySet = entry.dispatchKeyExtractor().getDispatchKeySetBoxed(stack); - const auto& kernel = entry.kernelForDispatchKey(dk); + const auto& kernel = ([&]() { + if (op.hasKernelForDispatchKey(dk)) { + return entry.kernelForDispatchKey(dk); + } else { + auto idx = getDispatchTableIndexForDispatchKey(dk); + TORCH_INTERNAL_ASSERT(idx >= 0); + return backendFallbackKernels_[idx].kernel; + } + })(); kernel.callBoxed(op, dispatchKeySet, stack); } - inline void Dispatcher::redispatchBoxed(const OperatorHandle& op, DispatchKeySet dispatchKeySet, Stack* stack) const { // note: this doesn't need the mutex because write operations on the list keep iterators intact. 
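With the python_dispatcher_state_ member added to ThreadLocalState above, the Python dispatcher now follows work across threads through the existing snapshot/restore pattern. A sketch of the assumed usage (not a call site from this patch):

#include <ATen/ThreadLocalState.h>
#include <functional>
#include <thread>

void run_on_worker(std::function<void()> task) {
  // Snapshot TLS on the submitting thread; the snapshot now carries
  // PythonDispatcherTLS alongside the autograd and torch_function state.
  at::ThreadLocalState tls;
  std::thread worker([tls, task]() {
    at::ThreadLocalStateGuard guard(tls);  // restored for the task's duration
    task();
  });
  worker.join();
}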
const auto& entry = op.operatorDef_->op; diff --git a/aten/src/ATen/core/dispatch/OperatorEntry.cpp b/aten/src/ATen/core/dispatch/OperatorEntry.cpp index 01d30c888db2..5d53500e7dfe 100644 --- a/aten/src/ATen/core/dispatch/OperatorEntry.cpp +++ b/aten/src/ATen/core/dispatch/OperatorEntry.cpp @@ -329,10 +329,8 @@ std::pair OperatorEntry::computeDispatchTab // to let the original CompositeImplicitAutograd handle Undefined if (dispatch_key != DispatchKey::Undefined && isIncludedInAlias(dispatch_key, DispatchKey::CompositeImplicitAutogradNestedTensor)) { if (auto nested_registration = getKernelForDispatchKey(DispatchKey::CompositeImplicitAutogradNestedTensor)) { - if (!has_backend_kernel) { - return {*nested_registration, "nested kernel"}; + return {*nested_registration, "nested kernel"}; } - } } if (dispatch_key == DispatchKey::Undefined || isIncludedInAlias(dispatch_key, DispatchKey::CompositeImplicitAutograd)) { diff --git a/aten/src/ATen/core/dispatch/OperatorEntry.h b/aten/src/ATen/core/dispatch/OperatorEntry.h index a964423d6aa8..c3bd91197f5e 100644 --- a/aten/src/ATen/core/dispatch/OperatorEntry.h +++ b/aten/src/ATen/core/dispatch/OperatorEntry.h @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -211,6 +212,11 @@ class TORCH_API OperatorEntry final { // Returns all the operator tags added at the time of registration const std::vector& getTags() const; + template + PyObject* getPythonOp(PyInterpreter* self_interpreter, F slow_accessor) const { + return py_cache_.ptr_or(self_interpreter, slow_accessor); + } + private: OperatorName name_; @@ -220,6 +226,8 @@ class TORCH_API OperatorEntry final { #endif std::array dispatchTable_; DispatchKeyExtractor dispatchKeyExtractor_; + // Pointer to the torch.ops.ns.op.overload object for speed + c10::PyHandleCache py_cache_; // kernels_ stores all registered kernels for the corresponding dispatch key // and catchAllKernels_ stores the catch-all kernels. diff --git a/aten/src/ATen/core/function_schema.h b/aten/src/ATen/core/function_schema.h index 14f134939d76..315ceaec19ac 100644 --- a/aten/src/ATen/core/function_schema.h +++ b/aten/src/ATen/core/function_schema.h @@ -550,7 +550,10 @@ inline std::ostream& operator<<(std::ostream& out, const Argument& arg) { // in schema, we have Tensor?(a!) input, and t(a!)?. // however, t?(a!) doesn't work with schema parser. // so we always use Type(alias)? format - auto type = arg.type(); + // real_type versus fake_type: in order to be compatible with FunctionSchema + // parser, printing an argument with either MemoryFormat or Layout type should + // give us the original schema string, hence printing out real_type. + auto type = arg.real_type(); bool is_opt = type->kind() == OptionalType::Kind; auto unopt_type = is_opt ? 
type->castRaw()->getElementType() : type; diff --git a/aten/src/ATen/core/ivalue.h b/aten/src/ATen/core/ivalue.h index 8d0199b3c954..9793730d1977 100644 --- a/aten/src/ATen/core/ivalue.h +++ b/aten/src/ATen/core/ivalue.h @@ -565,8 +565,6 @@ struct TORCH_API IValue final { } } - IValue(c10::SymIntArrayRef v); - bool isSymInt() const { return Tag::SymInt == tag; } diff --git a/aten/src/ATen/core/ivalue_inl.h b/aten/src/ATen/core/ivalue_inl.h index 00361c80a01c..270732c5e163 100644 --- a/aten/src/ATen/core/ivalue_inl.h +++ b/aten/src/ATen/core/ivalue_inl.h @@ -1999,7 +1999,6 @@ inline IValue::IValue(at::ArrayRef v) : IValue(c10::List()) { list.push_back(e); } } -inline IValue::IValue(c10::SymIntArrayRef v) : IValue(at::ArrayRef(v.data(), v.size())) {} template > inline IValue::IValue(const std::vector& v) : IValue(c10::List()) { auto list = to>(); diff --git a/aten/src/ATen/core/jit_type.h b/aten/src/ATen/core/jit_type.h index ce698761dad7..86f784b0c8f4 100644 --- a/aten/src/ATen/core/jit_type.h +++ b/aten/src/ATen/core/jit_type.h @@ -2114,7 +2114,7 @@ struct MemoryFormatType; using MemoryFormatTypePtr = SingletonTypePtr; struct TORCH_API MemoryFormatType : public EnumerationType { std::string str() const override { -return "MemoryFormatType"; +return "MemoryFormat"; } static const TypeKind Kind = TypeKind::MemoryFormatType; // global singleton @@ -2128,7 +2128,7 @@ struct LayoutType; using LayoutTypePtr = SingletonTypePtr; struct TORCH_API LayoutType : public EnumerationType { std::string str() const override { -return "LayoutType"; +return "Layout"; } static const TypeKind Kind = TypeKind::LayoutType; // global singleton diff --git a/aten/src/ATen/cuda/Atomic.cuh b/aten/src/ATen/cuda/Atomic.cuh index 03cabf8de73f..42975411e841 100644 --- a/aten/src/ATen/cuda/Atomic.cuh +++ b/aten/src/ATen/cuda/Atomic.cuh @@ -164,6 +164,7 @@ Atomic##NAME##IntegerImpl()(address, } \ ATOMIC_INTEGER_IMPL(Add) +GPU_ATOMIC_INTEGER(Add, a || b, bool) // Don't instantiate gpuAtomicAdd with the macro as it seems non-standard (see int32, int64) static inline __device__ void gpuAtomicAdd(uint8_t *address, uint8_t val) { @@ -206,10 +207,6 @@ static inline __device__ void gpuAtomicAdd(int64_t *address, int64_t val) { #endif } -static inline __device__ void gpuAtomicAdd(bool *address, bool val) { - *address = address && val; -} - static inline __device__ at::Half gpuAtomicAdd(at::Half *address, at::Half val) { #if defined(USE_ROCM) || ((defined(CUDA_VERSION) && CUDA_VERSION < 10000) || (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 700))) return AtomicFPOp()(address, val, diff --git a/aten/src/ATen/cuda/CUDABlas.cpp b/aten/src/ATen/cuda/CUDABlas.cpp index e1a01ceb6829..866f53ee7f87 100644 --- a/aten/src/ATen/cuda/CUDABlas.cpp +++ b/aten/src/ATen/cuda/CUDABlas.cpp @@ -709,11 +709,9 @@ void gemm_and_bias( CuBlasLtMatrixLayout Cdesc(abcType, m, n, result_ld); CuBlasLtMatmulPreference preference; - // See https://github.com/pytorch/pytorch/issues/73328. - // Check https://docs.nvidia.com/cuda/cublas/index.html#cublassetworkspace . - // Recommended size of user-provided workspace is at least 4MiB (to match - // cuBLAS' default workspace pool). - size_t workspaceSize = 4 * 1024 * 1024; + // See https://github.com/pytorch/pytorch/issues/73328 for reasoning behind + // setting this to 1M. 
+ size_t workspaceSize = 1024 * 1024; TORCH_CUDABLAS_CHECK(cublasLtMatmulPreferenceSetAttribute( preference.descriptor(), CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, diff --git a/aten/src/ATen/cuda/CUDAEvent.h b/aten/src/ATen/cuda/CUDAEvent.h index 8e83d82b5255..1c3c67949e58 100644 --- a/aten/src/ATen/cuda/CUDAEvent.h +++ b/aten/src/ATen/cuda/CUDAEvent.h @@ -157,6 +157,10 @@ struct TORCH_CUDA_CPP_API CUDAEvent { // Note: cudaEventSynchronize can be safely called from any device void synchronize() const { if (is_created_) { + const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace(); + if (C10_UNLIKELY(interp)) { + (*interp)->trace_gpu_event_synchronization(reinterpret_cast(event_)); + } AT_CUDA_CHECK(cudaEventSynchronize(event_)); } } diff --git a/aten/src/ATen/cuda/CUDAGeneratorImpl.cpp b/aten/src/ATen/cuda/CUDAGeneratorImpl.cpp index 0cac5d6da2d5..a678354dca49 100644 --- a/aten/src/ATen/cuda/CUDAGeneratorImpl.cpp +++ b/aten/src/ATen/cuda/CUDAGeneratorImpl.cpp @@ -231,7 +231,8 @@ uint64_t CUDAGeneratorImpl::philox_offset_per_thread() const { * offset_extragraph is the initial offset at the start of the graphed region. * offset_intragraph tracks the offset in the graphed region. */ -void CUDAGeneratorImpl::capture_prologue(int64_t* offset_extragraph) { +void CUDAGeneratorImpl::capture_prologue(int64_t* seed_extragraph, int64_t* offset_extragraph) { + seed_extragraph_ = seed_extragraph; offset_extragraph_ = offset_extragraph; offset_intragraph_ = 0; graph_expects_this_gen_ = true; @@ -279,7 +280,7 @@ PhiloxCudaState CUDAGeneratorImpl::philox_cuda_state(uint64_t increment) { TORCH_INTERNAL_ASSERT(this->offset_intragraph_ <= std::numeric_limits::max() - increment); this->offset_intragraph_ += increment; - return PhiloxCudaState(this->seed_, + return PhiloxCudaState(this->seed_extragraph_, this->offset_extragraph_, offset); } else { diff --git a/aten/src/ATen/cuda/CUDAGeneratorImpl.h b/aten/src/ATen/cuda/CUDAGeneratorImpl.h index 768f0b7549c2..60130b884719 100644 --- a/aten/src/ATen/cuda/CUDAGeneratorImpl.h +++ b/aten/src/ATen/cuda/CUDAGeneratorImpl.h @@ -100,7 +100,7 @@ struct TORCH_CUDA_CPP_API CUDAGeneratorImpl : public c10::GeneratorImpl { c10::intrusive_ptr get_state() const override; void set_philox_offset_per_thread(uint64_t offset); uint64_t philox_offset_per_thread() const; - void capture_prologue(int64_t* offset_extragraph); + void capture_prologue(int64_t* seed_extragraph, int64_t* offset_extragraph); uint64_t capture_epilogue(); PhiloxCudaState philox_cuda_state(uint64_t increment); @@ -114,6 +114,7 @@ struct TORCH_CUDA_CPP_API CUDAGeneratorImpl : public c10::GeneratorImpl { CUDAGeneratorImpl* clone_impl() const override; uint64_t seed_ = default_rng_seed_val; uint64_t philox_offset_per_thread_ = 0; + int64_t* seed_extragraph_{}; int64_t* offset_extragraph_{}; uint32_t offset_intragraph_ = 0; bool graph_expects_this_gen_ = false; diff --git a/aten/src/ATen/cuda/CUDAGraph.cpp b/aten/src/ATen/cuda/CUDAGraph.cpp index c7734334f4e2..583918e9fc08 100644 --- a/aten/src/ATen/cuda/CUDAGraph.cpp +++ b/aten/src/ATen/cuda/CUDAGraph.cpp @@ -65,9 +65,11 @@ void CUDAGraph::capture_begin(MempoolId_t pool/*=0*/) { c10::nullopt, cuda::detail::getDefaultCUDAGenerator()); auto options = TensorOptions().device(at::kCUDA).dtype(at::kLong); + seed_extragraph_ = at::empty({1}, options); offset_extragraph_ = at::empty({1}, options); - gen->capture_prologue(offset_extragraph_.data_ptr()); + seed_extragraph_.fill_(int64_t(gen->current_seed())); + 
gen->capture_prologue(seed_extragraph_.data_ptr(), offset_extragraph_.data_ptr()); auto stream = at::cuda::getCurrentCUDAStream(); @@ -175,6 +177,7 @@ void CUDAGraph::replay() { std::lock_guard lock(gen->mutex_); rng_engine_inputs = gen->philox_cuda_state(wholegraph_increment_); } + seed_extragraph_.fill_(int64_t(gen->current_seed())); offset_extragraph_.fill_(int64_t(rng_engine_inputs.offset_.val)); // graph_exec_ may be replayed in any stream. diff --git a/aten/src/ATen/cuda/CUDAGraph.h b/aten/src/ATen/cuda/CUDAGraph.h index 09b0b7b5d800..bacad79102a3 100644 --- a/aten/src/ATen/cuda/CUDAGraph.h +++ b/aten/src/ATen/cuda/CUDAGraph.h @@ -69,6 +69,7 @@ struct TORCH_CUDA_CPP_API CUDAGraph { int capture_dev_; // RNG state trackers + at::Tensor seed_extragraph_; at::Tensor offset_extragraph_; uint64_t wholegraph_increment_; }; diff --git a/aten/src/ATen/cuda/detail/PhiloxCudaStateRaw.cuh b/aten/src/ATen/cuda/detail/PhiloxCudaStateRaw.cuh index e14680f88793..a9b67b41ac45 100644 --- a/aten/src/ATen/cuda/detail/PhiloxCudaStateRaw.cuh +++ b/aten/src/ATen/cuda/detail/PhiloxCudaStateRaw.cuh @@ -13,14 +13,14 @@ struct PhiloxCudaState { // Called if graph capture is not underway PhiloxCudaState(uint64_t seed, uint64_t offset) { - seed_ = seed; + seed_.val = seed; offset_.val = offset; } // Called if graph capture is underway - PhiloxCudaState(uint64_t seed, + PhiloxCudaState(int64_t* seed, int64_t* offset_extragraph, uint32_t offset_intragraph) { - seed_ = seed; + seed_.ptr = seed; offset_.ptr = offset_extragraph; offset_intragraph_ = offset_intragraph; captured_ = true; @@ -34,7 +34,7 @@ struct PhiloxCudaState { int64_t* ptr; }; - uint64_t seed_ = 0; + Payload seed_; Payload offset_; uint32_t offset_intragraph_ = 0; bool captured_ = false; diff --git a/aten/src/ATen/cuda/detail/UnpackRaw.cuh b/aten/src/ATen/cuda/detail/UnpackRaw.cuh index e6746fbe4fd0..f8fa4ebbf160 100644 --- a/aten/src/ATen/cuda/detail/UnpackRaw.cuh +++ b/aten/src/ATen/cuda/detail/UnpackRaw.cuh @@ -21,9 +21,9 @@ unpack(at::PhiloxCudaState arg) { // static_cast avoids "warning: invalid narrowing conversion from "long" to "unsigned long". // *(arg.offset_.ptr) is a broadcast load of a single int64_t to the entire kernel. // For most threads' reads it will hit in cache, so it shouldn't hurt performance. 
- return std::make_tuple(arg.seed_, static_cast(*(arg.offset_.ptr) + arg.offset_intragraph_)); + return std::make_tuple(static_cast(*arg.seed_.ptr), static_cast(*(arg.offset_.ptr) + arg.offset_intragraph_)); } else { - return std::make_tuple(arg.seed_, arg.offset_.val); + return std::make_tuple(arg.seed_.val, arg.offset_.val); } } diff --git a/aten/src/ATen/cuda/llvm_complex.cpp b/aten/src/ATen/cuda/llvm_complex.cpp index d88bdc4ce657..0bb2c2ba9a09 100644 --- a/aten/src/ATen/cuda/llvm_complex.cpp +++ b/aten/src/ATen/cuda/llvm_complex.cpp @@ -48,6 +48,10 @@ class complex void real(value_type __re) {__re_ = __re;} void imag(value_type __im) {__im_ = __im;} + constexpr operator bool() const { + return real() || imag(); + } + complex& operator= (const value_type& __re) {__re_ = __re; __im_ = value_type(); return *this;} complex& operator+=(const value_type& __re) {__re_ += __re; return *this;} @@ -106,6 +110,10 @@ class complex void real(value_type __re) {__re_ = __re;} void imag(value_type __im) {__im_ = __im;} + constexpr operator bool() const { + return real() || imag(); + } + complex& operator= (float __re) {__re_ = __re; __im_ = value_type(); return *this;} complex& operator+=(float __re) {__re_ += __re; return *this;} @@ -162,6 +170,10 @@ class complex void real(value_type __re) {__re_ = __re;} void imag(value_type __im) {__im_ = __im;} + constexpr operator bool() const { + return real() || imag(); + } + complex& operator= (double __re) {__re_ = __re; __im_ = value_type(); return *this;} complex& operator+=(double __re) {__re_ += __re; return *this;} @@ -482,7 +494,15 @@ inline constexpr bool operator&&(const complex<_Tp>& __x, const complex<_Tp>& __y) { - return (__x.real() || __x.imag()) && (__y.real() || __y.imag()); + return bool(__x) && bool(__y); +} + +template +inline constexpr +bool +operator||(const complex<_Tp>& __x, const complex<_Tp>& __y) +{ + return bool(__x) || bool(__y); } // 26.3.7 values: diff --git a/functorch/functorch/csrc/BatchedTensorImpl.cpp b/aten/src/ATen/functorch/BatchedTensorImpl.cpp similarity index 59% rename from functorch/functorch/csrc/BatchedTensorImpl.cpp rename to aten/src/ATen/functorch/BatchedTensorImpl.cpp index 58d8bfdde6af..c5d6eb34030d 100644 --- a/functorch/functorch/csrc/BatchedTensorImpl.cpp +++ b/aten/src/ATen/functorch/BatchedTensorImpl.cpp @@ -3,51 +3,19 @@ // // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. -#include +#include #include #include -#include #include namespace at { namespace functorch { -BatchedTensorImpl::BatchedTensorImpl(Tensor value, int64_t bdim, int64_t level) - : TensorImpl( - c10::DispatchKeySet(kBatchedKey), - value.dtype(), - value.device() - ) - , value_(std::move(value)) - , level_(level) - , bdim_(bdim) -{ - // TODO: I don't think this ctor gets used. 
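The seed now shares the offset's pointer-or-value payload, so a graph replay can rewrite both before each launch; the selection in unpack above reduces to this plain-C++ sketch (mirroring the union in PhiloxCudaStateRaw.cuh, illustrative only):

#include <cstdint>
#include <tuple>

union Payload { uint64_t val; int64_t* ptr; };

// captured == true while a CUDA graph is in flight: read seed and offset
// through pointers that replay() refills; otherwise use the inline values.
std::tuple<uint64_t, uint64_t> unpack_sketch(Payload seed, Payload offset,
                                             uint32_t offset_intragraph,
                                             bool captured) {
  if (captured) {
    return std::make_tuple(static_cast<uint64_t>(*seed.ptr),
                           static_cast<uint64_t>(*offset.ptr + offset_intragraph));
  }
  return std::make_tuple(seed.val, offset.val);
}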
- TORCH_INTERNAL_ASSERT(false); - TORCH_INTERNAL_ASSERT(value_.defined()); - set_storage_access_should_throw(); - set_sizes_strides_policy(SizesStridesPolicy::CustomStrides); - checkInvariants(); - - const auto public_dims = value_.dim() - 1; - const auto value_sizes = value_.sizes(); - const auto value_strides = value_.strides(); - sizes_and_strides_.resize(public_dims); - for (const auto dim : c10::irange(0, public_dims)) { - auto actual_dim = actualDim(dim, /*wrap_dim=*/false); - sizes_and_strides_.size_at_unchecked(dim) = value_sizes.at(actual_dim); - sizes_and_strides_.stride_at_unchecked(dim) = value_strides.at(actual_dim); - } - storage_offset_= value_.storage_offset(); - refresh_numel(); - refresh_contiguous(); -} - BatchedTensorImpl::BatchedTensorImpl(DispatchKeySet key_set, Tensor value, int64_t bdim, int64_t level) : TensorImpl( - key_set.add(kBatchedKey), + key_set.add(DispatchKey::FuncTorchBatched), value.dtype(), value.device() ) @@ -57,7 +25,7 @@ BatchedTensorImpl::BatchedTensorImpl(DispatchKeySet key_set, Tensor value, int64 { TORCH_INTERNAL_ASSERT(value_.defined()); set_storage_access_should_throw(); - set_sizes_strides_policy(SizesStridesPolicy::CustomStrides); + set_custom_sizes_strides(SizesStridesPolicy::CustomStrides); checkInvariants(); refreshTensorMetadata(); } @@ -82,36 +50,11 @@ int64_t BatchedTensorImpl::actualDim(int64_t dim, bool wrap_dim) const { const auto ndim = sizes_and_strides_.size(); dim = maybe_wrap_dim(dim, ndim); } - auto is_bdim = createBatchDimBitset(bdim_); - - // TODO(vfdev): As BatchedTensorImpl is refactored and has only one dim. - // Below code may be simplified. - - // Example: assume dim = 3, and is_bdim = 10010011000... - // The 1's are batch dims and 0's are normal dims of the underlying value_ Tensor. - // actualDim gives us the index of `dim` in the `value_` Tensor, which is equivalent - // to asking "where does the 3rd (0-indexed) zero occur in the bitset?". - // The answer to that is index 5. - // - // TODO(rzou): the PDEP instruction does exactly this - // (https://stackoverflow.com/questions/7669057/find-nth-set-bit-in-an-int) - // but it might require newer (>= ~2015) CPUs. We should clean this up - // if/when we have dropped support for older CPUs. - int64_t non_bdim_count = 0; - for (int64_t actual_dim = 0; actual_dim < kVmapMaxTensorDims; actual_dim++) { - if (is_bdim[actual_dim]) { - continue; - } - if (non_bdim_count == dim) { - return actual_dim; - } - non_bdim_count++; + if (bdim_ <= dim) { + return dim + 1; + } else { + return dim; } - // If we hit this assert, then that means - // `non_bdim_count` + #num_bdims > kVmapMaxTensorDims. We restrict the number - // of dims a BatchedTensorImpl can have to kVmapMaxTensorDims so this should - // never be hit. - TORCH_INTERNAL_ASSERT(false); } void BatchedTensorImpl::checkInvariants() const { diff --git a/functorch/functorch/csrc/BatchedTensorImpl.h b/aten/src/ATen/functorch/BatchedTensorImpl.h similarity index 83% rename from functorch/functorch/csrc/BatchedTensorImpl.h rename to aten/src/ATen/functorch/BatchedTensorImpl.h index 3d422d68491e..320989604570 100644 --- a/functorch/functorch/csrc/BatchedTensorImpl.h +++ b/aten/src/ATen/functorch/BatchedTensorImpl.h @@ -12,9 +12,6 @@ #include #include -#include -#include - namespace at { namespace functorch { @@ -43,8 +40,7 @@ constexpr int64_t kBatchDimsStackSize = 5; // // bt.sizes() returns (5, 7); bt.sum(0) performs a reduction over the (public) // dim 0, which is equivalent to dim 3 in the underlying ones(2, 3, 5, 7) tensor. 
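Since a BatchedTensorImpl now carries exactly one batch dim, the bitset walk above collapses to a single comparison; restated standalone with a worked case (illustrative):

#include <cassert>
#include <cstdint>

// Map a public dim to the underlying dim by skipping the batch dim at bdim.
int64_t actual_dim(int64_t dim, int64_t bdim) {
  return bdim <= dim ? dim + 1 : dim;
}

int main() {
  // An underlying value_ of sizes (2, 3, 5) with bdim = 1 presents public
  // sizes (2, 5) to vmap:
  assert(actual_dim(0, /*bdim=*/1) == 0);  // public dim 0 -> underlying dim 0
  assert(actual_dim(1, /*bdim=*/1) == 2);  // public dim 1 skips the batch dim
}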
-struct BatchedTensorImpl : public c10::TensorImpl { - explicit BatchedTensorImpl(Tensor value, int64_t dim, int64_t level); +struct TORCH_API BatchedTensorImpl : public c10::TensorImpl { explicit BatchedTensorImpl(at::DispatchKeySet key_set, Tensor value, int64_t dim, int64_t level); // Returns batch dimension of this tensor @@ -79,9 +75,16 @@ struct BatchedTensorImpl : public c10::TensorImpl { #endif void refreshTensorMetadata(); + + // Used in torchdim. torchdim uses non-lexical BatchedTensor; the way it + // accomplishes this is a hack where it is able to modify the levels of + // BatchedTensor to match the level of the current vmap transform. void _unsafe_set_level(int64_t level) { level_ = level; } + + // Used in batching rule for in-place view operations that can change + // the index of the bdim (think squeeze_, unsqueeze_) void unsafe_set_bdim(int64_t bdim) { // NB: you MUST call refreshTensorMetadata after doing this. bdim_ = bdim; @@ -100,7 +103,7 @@ struct BatchedTensorImpl : public c10::TensorImpl { // NB: We use the term "BatchedTensor" to mean a Tensor that is backed with a // BatchedTensorImpl. inline bool isBatchedTensor(const Tensor& tensor) { - return tensor.unsafeGetTensorImpl()->key_set().has(kBatchedKey); + return tensor.unsafeGetTensorImpl()->key_set().has(DispatchKey::FuncTorchBatched); } // It is unsafe to call this on a Tensor that is not backed by a @@ -131,11 +134,15 @@ inline std::bitset createVmapLevelsBitset(int64_t level) { } // Use this to construct a BatchedTensor from a regular Tensor -FUNCTORCH_API Tensor makeBatched(const Tensor& tensor, int64_t dim, int64_t level); +TORCH_API Tensor makeBatched(const Tensor& tensor, int64_t dim, int64_t level); // Adds a batch dim to `tensor`, returning a BatchedTensor -FUNCTORCH_API Tensor addBatchDim(const Tensor& tensor, int64_t dim, int64_t level); +TORCH_API Tensor addBatchDim(const Tensor& tensor, int64_t dim, int64_t level); +// Certain dispatch keys must be propagated to the BatchedTensor (or, in general, +// any wrapper Tensor subclasses). This is because there are methods on Tensor +// that skip dispatch and check for the presence of a dispatch key (e.g. is_cpu()). +// TODO: should probably contain more (or all?) backend keys constexpr DispatchKeySet kKeysToPropagateToWrapper({ DispatchKey::Negative, DispatchKey::Conjugate, diff --git a/aten/src/ATen/mps/MPSFallback.mm b/aten/src/ATen/mps/MPSFallback.mm index 4f9e635dce05..75092867fa01 100644 --- a/aten/src/ATen/mps/MPSFallback.mm +++ b/aten/src/ATen/mps/MPSFallback.mm @@ -14,7 +14,7 @@ void mps_fallback(const c10::OperatorHandle& op, torch::jit::Stack* stack) void mps_error_fallback(const c10::OperatorHandle& op, torch::jit::Stack* stack) { - TORCH_CHECK_NOT_IMPLEMENTED(false, "The operator '", op.schema().operator_name(), "' is not current implemented ", + TORCH_CHECK_NOT_IMPLEMENTED(false, "The operator '", op.schema().operator_name(), "' is not currently implemented ", "for the MPS device. If you want this op to be added in priority during the prototype ", "phase of this feature, please comment on https://github.com/pytorch/pytorch/issues/77764. 
", "As a temporary fix, you can set the environment variable `PYTORCH_ENABLE_MPS_FALLBACK=1` ", diff --git a/aten/src/ATen/native/Activation.cpp b/aten/src/ATen/native/Activation.cpp index 97f504b85dd1..e7aec5e08a40 100644 --- a/aten/src/ATen/native/Activation.cpp +++ b/aten/src/ATen/native/Activation.cpp @@ -314,7 +314,7 @@ bool use_mkldnn(const Tensor& input) { if (!at::globalContext().userEnabledMkldnn()) { return false; } - if (!input.is_contiguous() || input.numel() == 1) { + if (!input.is_contiguous() || input.numel() <= 1) { return false; } return (input.is_mkldnn()) || // input is mkldnn Tensor diff --git a/aten/src/ATen/native/AdaptiveAveragePooling.cpp b/aten/src/ATen/native/AdaptiveAveragePooling.cpp index 855d54eadba8..0057f58b07d9 100644 --- a/aten/src/ATen/native/AdaptiveAveragePooling.cpp +++ b/aten/src/ATen/native/AdaptiveAveragePooling.cpp @@ -106,7 +106,7 @@ namespace { return at::mkldnn_adaptive_avg_pool2d(input, output_size); } - if (!input.is_quantized() && output_size[0] == 1 && output_size[1] == 1) { + if (!input.is_quantized() && output_size[0] == 1 && output_size[1] == 1 && !input.is_xpu()) { // in this case, adaptive pooling is just computing mean over hw // dimensions, which can be done more efficiently #if defined(C10_MOBILE) && defined(USE_XNNPACK) diff --git a/aten/src/ATen/native/AdaptiveAveragePooling3d.cpp b/aten/src/ATen/native/AdaptiveAveragePooling3d.cpp index 71f45daeebac..06257b42cd96 100644 --- a/aten/src/ATen/native/AdaptiveAveragePooling3d.cpp +++ b/aten/src/ATen/native/AdaptiveAveragePooling3d.cpp @@ -306,7 +306,7 @@ Tensor adaptive_avg_pool3d(Tensor const& input, IntArrayRef output_size) { "adaptive_avg_pool2d: elements of output_size must be greater than or equal to 0 ", "but received {", output_size[0], ", ", output_size[1], ",", output_size[2], "}"); - if (output_size[0] == 1 && output_size[1] == 1 && output_size[2] == 1) { + if (output_size[0] == 1 && output_size[1] == 1 && output_size[2] == 1 && !input.is_xpu()) { // in this case, adaptive pooling is just computing mean over hw // dimensions, which can be done more efficiently Tensor out = input.mean({-1, -2, -3}, /* keepdim = */ true); diff --git a/aten/src/ATen/native/BatchLinearAlgebra.cpp b/aten/src/ATen/native/BatchLinearAlgebra.cpp index 7464e12fd7d3..09bffa1a7438 100644 --- a/aten/src/ATen/native/BatchLinearAlgebra.cpp +++ b/aten/src/ATen/native/BatchLinearAlgebra.cpp @@ -3168,66 +3168,6 @@ Tensor linalg_eigvals(const Tensor& input) { return values; } -// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ eig ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -DEFINE_DISPATCH(eig_stub); - -std::tuple eig_out(const Tensor& self, bool eigenvectors, Tensor& e, Tensor& v) { - TORCH_WARN_ONCE( - "torch.eig is deprecated in favor of torch.linalg.eig and will be removed in a future ", - "PyTorch release.\n", - "torch.linalg.eig returns complex tensors of dtype cfloat or cdouble rather than real tensors ", - "mimicking complex tensors.\n", - "L, _ = torch.eig(A)\n", - "should be replaced with\n", - "L_complex = torch.linalg.eigvals(A)\n", - "and\n", - "L, V = torch.eig(A, eigenvectors=True)\n", - "should be replaced with\n", - "L_complex, V_complex = torch.linalg.eig(A)" - ); - TORCH_CHECK(self.dim() == 2, "input should be 2 dimensional"); - TORCH_CHECK(self.size(0) == self.size(1), "input should be square"); - TORCH_CHECK(self.isfinite().all().item(), "input should not contain infs or NaNs"); - checkSameDevice("torch.eig", e, self, "eigenvalues"); - checkLinalgCompatibleDtype("torch.eig", e, self, "eigenvalues"); - 
if (eigenvectors) { - checkSameDevice("torch.eig", v, self, "eigenvectors"); - checkLinalgCompatibleDtype("torch.eig", v, self, "eigenvectors"); - } - int64_t n = self.size(-1); - - if (isComplexType(at::typeMetaToScalarType(self.dtype()))) { - at::native::resize_output(e, {n}); - } else { - at::native::resize_output(e, {n, 2}); - } - if (eigenvectors) { - at::native::resize_output(v, self.sizes()); - } - - // optimization: if self is empty, we can immediately return the empty - // tensors, instead of getting empty tensors from eig_helper - if (self.numel() == 0) { - return std::tuple(e, v); - } - - Tensor vals_, vecs_; - std::tie(vals_, vecs_) = eig_stub(self.device().type(), self, eigenvectors); - e.copy_(vals_); - if (eigenvectors) { - v.copy_(vecs_); - } - return std::tuple(e, v); -} - -std::tuple eig(const Tensor& self, bool eigenvectors) { - Tensor e = at::empty({0}, self.options()); - Tensor v = at::empty({0}, self.options()); - at::eig_out(e, v, self, eigenvectors); - return std::tuple(e, v); -} - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ linalg_svd ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ /* torch.svd, implemented in terms of torch.linalg.svd. There are two main diff --git a/aten/src/ATen/native/BatchLinearAlgebra.h b/aten/src/ATen/native/BatchLinearAlgebra.h index 531595f3544e..a86be95f40bd 100644 --- a/aten/src/ATen/native/BatchLinearAlgebra.h +++ b/aten/src/ATen/native/BatchLinearAlgebra.h @@ -231,10 +231,6 @@ using cholesky_inverse_fn = Tensor& (*)(Tensor& /*result*/, Tensor& /*infos*/, b DECLARE_DISPATCH(cholesky_inverse_fn, cholesky_inverse_stub); -using eig_fn = std::tuple (*)(const Tensor&, bool&); - -DECLARE_DISPATCH(eig_fn, eig_stub); - using linalg_eig_fn = void (*)(Tensor& /*eigenvalues*/, Tensor& /*eigenvectors*/, Tensor& /*infos*/, const Tensor& /*input*/, bool /*compute_eigenvectors*/); DECLARE_DISPATCH(linalg_eig_fn, linalg_eig_stub); diff --git a/aten/src/ATen/native/BatchLinearAlgebraKernel.cpp b/aten/src/ATen/native/BatchLinearAlgebraKernel.cpp index 5b18dbe2d5fa..3fe9fc137697 100644 --- a/aten/src/ATen/native/BatchLinearAlgebraKernel.cpp +++ b/aten/src/ATen/native/BatchLinearAlgebraKernel.cpp @@ -127,87 +127,6 @@ Tensor& cholesky_inverse_kernel_impl(Tensor& result, Tensor& infos, bool upper) return result; } -template -void apply_eig(const Tensor& self, bool eigenvectors, Tensor& vals_, Tensor& vecs_, int* info_ptr) { -#if !AT_BUILD_WITH_LAPACK() - TORCH_CHECK(false, "Calling torch.eig on a CPU tensor requires compiling ", - "PyTorch with LAPACK. Please use PyTorch built with LAPACK support."); -#else - using value_t = typename c10::scalar_value_type::type; - - char jobvr = eigenvectors ? 'V' : 'N'; - int64_t n = self.size(-1); - auto self_data = self.data_ptr(); - - auto vals_data = vals_.data_ptr(); - scalar_t* wr = vals_data; - - scalar_t* vecs_data = eigenvectors ? vecs_.data_ptr() : nullptr; - // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions) - int ldvr = eigenvectors ? 
n : 1; - - Tensor rwork; - value_t* rwork_data = nullptr; - if (self.is_complex()) { - ScalarType real_dtype = toRealValueType(typeMetaToScalarType(self.dtype())); - rwork = at::empty({n*2}, self.options().dtype(real_dtype)); - rwork_data = rwork.data_ptr(); - } - - if (n > 0) { - // call lapackEig once to get the optimal size for work data - scalar_t wkopt; - // NOLINTNEXTLINE(cppcoreguidelines-init-variables) - lapackEig('N', jobvr, n, self_data, n, wr, - nullptr, 1, vecs_data, ldvr, &wkopt, -1, rwork_data, info_ptr); - int lwork = std::max(1, real_impl(wkopt)); - - // call again to do the actual work - Tensor work = at::empty({lwork}, self.dtype()); - lapackEig('N', jobvr, n, self_data, n, wr, - nullptr, 1, vecs_data, ldvr, work.data_ptr(), lwork, rwork_data, info_ptr); - } -#endif -} - -std::tuple eig_kernel_impl(const Tensor& self, bool& eigenvectors) { - int64_t n = self.size(-1); - // lapackEig function expects the input to be column major, or stride {1, n}, - // so we must set the stride manually since the default stride for tensors is - // row major, {n, 1} - Tensor self_ = at::empty_strided( - {n, n}, - {1, n}, - at::TensorOptions(self.dtype())); - self_.copy_(self); - - auto options = self.options().memory_format(LEGACY_CONTIGUOUS_MEMORY_FORMAT); - - // the API is slightly different for the complex vs real case: if the input - // is complex, eigenvals will be a vector of complex. If the input is real, - // eigenvals will be a (n, 2) matrix containing the real and imaginary parts - // in each column - Tensor vals_; - if (self.is_complex()) { - vals_ = at::empty({n}, options); - } else { - vals_ = at::empty_strided({n, 2}, {1, n}, options); - } - Tensor vecs_ = eigenvectors - ? at::empty_strided({n, n}, {1, n}, options) - : Tensor(); - - // NOLINTNEXTLINE(cppcoreguidelines-init-variables) - auto infos = at::zeros({}, self.options().dtype(kInt)); - AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(self.scalar_type(), "eig_cpu", [&]{ - apply_eig(self_, eigenvectors, vals_, vecs_, infos.data_ptr()); - }); - // NOLINTNEXTLINE(clang-analyzer-core.CallAndMessage) - at::_linalg_check_errors(infos, "eig", /*is_matrix*/true); - - return std::tuple(vals_, vecs_); -} - /* Computes the eigenvalues and eigenvectors of n-by-n matrix 'input'. This is an in-place routine, content of 'input', 'values', 'vectors' is overwritten. 
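The deleted eig_kernel_impl above documents a layout detail worth keeping in mind: LAPACK drivers expect column-major (Fortran-order) storage, and ATen can request it directly with explicit strides. A small standalone sketch of just that trick, independent of the removed code:

```cpp
#include <ATen/ATen.h>

int main() {
  // The default ATen layout for an n-by-n tensor is row-major, strides
  // {n, 1}. Passing strides {1, n} to empty_strided yields a column-major
  // buffer, which is what the LAPACK path copied `self` into before the call.
  const int64_t n = 4;
  auto a = at::randn({n, n});
  auto a_fortran = at::empty_strided({n, n}, {1, n}, a.options());
  a_fortran.copy_(a);  // same logical values, Fortran-order storage
  TORCH_CHECK(a_fortran.stride(0) == 1 && a_fortran.stride(1) == n);
  return 0;
}
```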
@@ -1200,12 +1119,6 @@ REGISTER_AVX2_DISPATCH(cholesky_inverse_stub, &cholesky_inverse_kernel_impl); REGISTER_VSX_DISPATCH(cholesky_inverse_stub, &cholesky_inverse_kernel_impl); REGISTER_ZVECTOR_DISPATCH(cholesky_inverse_stub, &cholesky_inverse_kernel_impl); -REGISTER_ARCH_DISPATCH(eig_stub, DEFAULT, &eig_kernel_impl); -REGISTER_AVX512_DISPATCH(eig_stub, &eig_kernel_impl); -REGISTER_AVX2_DISPATCH(eig_stub, &eig_kernel_impl); -REGISTER_VSX_DISPATCH(eig_stub, &eig_kernel_impl); -REGISTER_ZVECTOR_DISPATCH(eig_stub, &eig_kernel_impl); - REGISTER_ARCH_DISPATCH(linalg_eig_stub, DEFAULT, &linalg_eig_kernel); REGISTER_AVX512_DISPATCH(linalg_eig_stub, &linalg_eig_kernel); REGISTER_AVX2_DISPATCH(linalg_eig_stub, &linalg_eig_kernel); diff --git a/aten/src/ATen/native/Linear.cpp b/aten/src/ATen/native/Linear.cpp index 255f86ca1a30..f6d61076dec5 100644 --- a/aten/src/ATen/native/Linear.cpp +++ b/aten/src/ATen/native/Linear.cpp @@ -179,24 +179,6 @@ static Tensor sumproduct_pair(const Tensor& left_, const Tensor& right_, IntArra return result; } -namespace { - -bool einsum_check_label(unsigned char label) { - return std::isalpha(label); -} - -uint8_t einsum_label_to_index(unsigned char label) { - constexpr uint8_t NUM_OF_LETTERS = 'z' - 'a' + 1; - return std::isupper(label) ? label - 'A' : NUM_OF_LETTERS + (label - 'a'); -} - -unsigned char einsum_index_to_label(uint8_t index) { - constexpr uint8_t NUM_OF_LETTERS = 'z' - 'a' + 1; - return index < NUM_OF_LETTERS ? index + 'A' : index - NUM_OF_LETTERS + 'a'; -} - -} // namespace - // There are roughly three parts to compute einsum: // 1. Parse equation to extract the labels for each input operand and output // 2. Unsqueeze missing dimensions from input operands and permute to align them @@ -205,8 +187,22 @@ unsigned char einsum_index_to_label(uint8_t index) { Tensor einsum(c10::string_view equation, TensorList operands) { TORCH_CHECK(!operands.empty(), "einsum(): must provide at least one operand"); + // Labels must be in range [A-Za-z] + constexpr uint8_t NUM_OF_LETTERS = 'z' - 'a' + 1; + constexpr uint8_t TOTAL_LABELS = NUM_OF_LETTERS * 2; + // Code used to identify ELLIPSIS ("...") - constexpr uint8_t ELLIPSIS = 52; + constexpr uint8_t ELLIPSIS = TOTAL_LABELS; + + // Convert label in [A-Za-z] to subscript in [0, TOTAL_LABELS) + auto label_to_subscript = [=](unsigned char label) -> uint8_t { + return std::isupper(label) ? label - 'A' : label - 'a' + NUM_OF_LETTERS; + }; + + // Convert subscript in [0, TOTAL_LABELS) to label in [A-Za-z] + auto subscript_to_label = [=](uint8_t s) -> unsigned char { + return s < NUM_OF_LETTERS ? s + 'A' : s + 'a' - NUM_OF_LETTERS; + }; // Find arrow (->) to split equation into lhs and rhs const auto arrow_pos = equation.find("->"); @@ -255,11 +251,11 @@ Tensor einsum(c10::string_view equation, TensorList operands) { default: // Parse label TORCH_CHECK( - einsum_check_label(label), + std::isalpha(label), "einsum(): invalid subscript given at index ", i, " in the equation string, subscripts must be in [a-zA-Z]"); - op_labels[curr_op].push_back(einsum_label_to_index(label)); + op_labels[curr_op].push_back(label_to_subscript(label)); } } @@ -267,8 +263,6 @@ Tensor einsum(c10::string_view equation, TensorList operands) { curr_op == num_ops - 1, "einsum(): more operands were provided than specified in the equation"); - // Labels must be within [a-zA-Z]. 
- constexpr uint8_t TOTAL_LABELS = 52; std::vector label_count(TOTAL_LABELS, 0); // The maximum number of dimensions covered by any ellipsis, needed when @@ -354,11 +348,11 @@ Tensor einsum(c10::string_view equation, TensorList operands) { default: TORCH_CHECK( - einsum_check_label(label), + std::isalpha(label), "einsum(): invalid subscript given at index ", - lhs.size() + 2 + i, + lhs.size() + 2 + i, " in the equation string, subscripts must be in [a-zA-Z]"); - const auto index = einsum_label_to_index(label); + const auto index = label_to_subscript(label); TORCH_CHECK( // Ensure label appeared at least once for some input operand and at // most once for the output @@ -420,7 +414,7 @@ Tensor einsum(c10::string_view equation, TensorList operands) { TORCH_CHECK( operand.size(j) == operand.size(dim), "einsum(): subscript ", - einsum_index_to_label(label), + subscript_to_label(label), " is repeated for operand ", i, " but the sizes don't match, ", diff --git a/aten/src/ATen/native/MaxUnpooling.cpp b/aten/src/ATen/native/MaxUnpooling.cpp index 27d4e1a93c81..33cc4dc7a61c 100644 --- a/aten/src/ATen/native/MaxUnpooling.cpp +++ b/aten/src/ATen/native/MaxUnpooling.cpp @@ -11,6 +11,10 @@ Tensor& max_unpooling2d_forward_out_cpu( const Tensor& indices_, IntArrayRef output_size, Tensor& output) { + // See Note [Writing Nondeterministic Operations] + // Nondeterministic with duplicate indices + at::globalContext().alertNotDeterministic("max_unpooling2d_forward_out"); + auto oheight = output_size[0]; auto owidth = output_size[1]; TORCH_CHECK( @@ -149,6 +153,10 @@ Tensor& max_unpooling3d_forward_out_cpu(const Tensor& self_, IntArrayRef stride, IntArrayRef padding, Tensor& output) { + // See Note [Writing Nondeterministic Operations] + // Nondeterministic with duplicate indices + at::globalContext().alertNotDeterministic("max_unpooling3d_forward_out"); + TORCH_CHECK(output.is_contiguous(), "output must be contiguous"); int64_t oT = output_size[0]; int64_t oH = output_size[1]; diff --git a/aten/src/ATen/native/MetaTensor.cpp b/aten/src/ATen/native/MetaTensor.cpp index a58b18c786e8..e29317c25870 100644 --- a/aten/src/ATen/native/MetaTensor.cpp +++ b/aten/src/ATen/native/MetaTensor.cpp @@ -12,7 +12,7 @@ namespace at { namespace native { -Tensor empty_meta( +Tensor empty_meta_symint( SymIntArrayRef size, c10::optional dtype_opt, c10::optional layout_opt, @@ -29,6 +29,7 @@ Tensor empty_meta( size, dtype_opt, layout_opt, device_opt, pin_memory_opt, memory_format_opt); } +// Kept only for BC with XLA Tensor empty_strided_meta( IntArrayRef size, IntArrayRef stride, @@ -37,7 +38,18 @@ Tensor empty_strided_meta( c10::optional device_opt, c10::optional pin_memory_opt ) { - return at::detail::empty_strided_meta( + return empty_strided_meta_symint(c10::fromIntArrayRef(size), c10::fromIntArrayRef(stride), dtype_opt, layout_opt, device_opt, pin_memory_opt); +} + +Tensor empty_strided_meta_symint( + SymIntArrayRef size, + SymIntArrayRef stride, + c10::optional dtype_opt, + c10::optional layout_opt, + c10::optional device_opt, + c10::optional pin_memory_opt +) { + return at::detail::empty_strided_symint_meta( size, stride, dtype_opt, layout_opt, device_opt, pin_memory_opt); } diff --git a/aten/src/ATen/native/Normalization.cpp b/aten/src/ATen/native/Normalization.cpp index 34d906b7adc4..a67377f047d7 100644 --- a/aten/src/ATen/native/Normalization.cpp +++ b/aten/src/ATen/native/Normalization.cpp @@ -14,6 +14,7 @@ #include #include +#include static const int MIOPEN_DIM_MAX = 5; @@ -41,7 +42,7 @@ 
DEFINE_DISPATCH(batch_norm_cpu_backward_stub); DEFINE_DISPATCH(renorm_scale_factor_stub); namespace { - void check_dims_match_num_input_features(const char* arg_name, int64_t expected, int64_t actual){ + void check_dims_match_num_input_features(const char* arg_name, SymInt expected, SymInt actual){ TORCH_CHECK(actual == expected, arg_name, " should contain ", expected, " elements not ", actual); } @@ -443,14 +444,14 @@ std::tuple _batch_norm_impl_index( const Tensor& running_mean = c10::value_or_else(running_mean_opt, [] {return Tensor();}); const Tensor& running_var = c10::value_or_else(running_var_opt, [] {return Tensor();}); - auto num_features = input.sizes()[1]; + auto num_features = input.sym_sizes()[1]; - if (input.numel() == 0) { + if (input.sym_numel() == 0) { Tensor reserve = at::empty({0}, input.options().dtype(kByte)); auto options = input.options().dtype( at::toAccumulateType(input.scalar_type(), /*is_cuda=*/input.is_cuda())); - auto save_mean = at::empty({num_features}, options); - auto save_invstd = at::empty({num_features}, options); + auto save_mean = at::empty_symint(c10::SymIntArrayRef({num_features}), options); + auto save_invstd = at::empty_symint(c10::SymIntArrayRef({num_features}), options); // don't return view of input, don't return empty tensor because it will break gradient chain auto out = input.clone(); @@ -461,20 +462,20 @@ std::tuple _batch_norm_impl_index( } if (running_mean.defined()) { - check_dims_match_num_input_features("running_mean", num_features, running_mean.numel()); + check_dims_match_num_input_features("running_mean", num_features, running_mean.sym_numel()); } else if (!training) { AT_ERROR("running_mean must be defined in evaluation mode"); } if (running_var.defined()) { - check_dims_match_num_input_features("running_var", num_features, running_var.numel()); + check_dims_match_num_input_features("running_var", num_features, running_var.sym_numel()); } else if (!training) { AT_ERROR("running_var must be defined in evaluation mode"); } if (weight.defined()) { - check_dims_match_num_input_features("weight", num_features, weight.numel()); + check_dims_match_num_input_features("weight", num_features, weight.sym_numel()); } if (bias.defined()) { - check_dims_match_num_input_features("bias", num_features, bias.numel()); + check_dims_match_num_input_features("bias", num_features, bias.sym_numel()); } const bool use_cudnn = ( diff --git a/aten/src/ATen/native/README.md b/aten/src/ATen/native/README.md index cfce94a36c0e..01a25e3a978c 100644 --- a/aten/src/ATen/native/README.md +++ b/aten/src/ATen/native/README.md @@ -628,7 +628,8 @@ the torch._C._nn (marked with `python_module: nn`), torch._C._fft (marked with `python_module: fft`), torch._C._linalg (marked with `python_module: linalg`) objects, torch._C._sparse (marked with `python_module: sparse`) objects, -or torch._C._special (marked with `python_module: special`) objects. +torch._C._special (marked with `python_module: special`) objects, +or torch._C._nested (marked with `python_module: nested`) objects. 
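The Normalization.cpp hunk above swaps concrete int64_t size queries for their SymInt counterparts; for ordinary eager tensors the two behave identically, which is why the existing checks keep passing. A rough sketch of that equivalence, assuming a libtorch build from this branch with the sym_sizes()/sym_numel() APIs:

```cpp
#include <ATen/ATen.h>

int main() {
  auto input = at::randn({8, 16, 4, 4});
  auto running_mean = at::zeros({16});

  // c10::SymInt compares (and prints) like a plain integer when sizes are
  // concrete, and additionally stays valid when sizes are symbolic under
  // tracing, which is the point of the change.
  c10::SymInt num_features = input.sym_sizes()[1];
  TORCH_CHECK(running_mean.sym_numel() == num_features,
              "running_mean should contain ", num_features, " elements");
  return 0;
}
```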
### Undefined tensor conventions diff --git a/aten/src/ATen/native/RNN.cpp b/aten/src/ATen/native/RNN.cpp index e40caef80e3c..b195422ff862 100644 --- a/aten/src/ATen/native/RNN.cpp +++ b/aten/src/ATen/native/RNN.cpp @@ -717,7 +717,7 @@ struct GRUCell : Cell { const hidden_type& hidden, const cell_params& params, bool pre_compute_input = false) const override { - if (input.is_cuda()) { + if (input.is_cuda() || input.is_xpu()) { TORCH_CHECK(!pre_compute_input); auto igates = params.matmul_ih(input); auto hgates = params.matmul_hh(hidden); diff --git a/aten/src/ATen/native/ReflectionPad.cpp b/aten/src/ATen/native/ReflectionPad.cpp index db744cc95eb0..7824de63805f 100644 --- a/aten/src/ATen/native/ReflectionPad.cpp +++ b/aten/src/ATen/native/ReflectionPad.cpp @@ -965,8 +965,8 @@ TORCH_IMPL_FUNC(reflection_pad3d_out_cpu) auto input = input_.contiguous(); if (batch_mode) { - AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND1( - kHalf, input.scalar_type(), "reflection_pad3d_cpu", [&] { + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2( + kHalf, kBFloat16, input.scalar_type(), "reflection_pad3d_cpu", [&] { auto input_data = input.data_ptr(); auto output_data = output.data_ptr(); auto nbatch = input.size(0); @@ -986,8 +986,8 @@ TORCH_IMPL_FUNC(reflection_pad3d_out_cpu) pad_front); }); } else { - AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND1( - kHalf, input.scalar_type(), "reflection_pad3d_cpu", [&] { + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2( + kHalf, kBFloat16, input.scalar_type(), "reflection_pad3d_cpu", [&] { auto input_data = input.data_ptr(); auto output_data = output.data_ptr(); reflection_pad3d_out_frame( @@ -1043,8 +1043,8 @@ TORCH_IMPL_FUNC(reflection_pad3d_backward_out_cpu)(const Tensor& grad_output, grad_input.zero_(); if (batch_mode) { - AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND1( - kHalf, input.scalar_type(), "reflection_pad3d_backward_cpu", [&] { + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2( + kHalf, kBFloat16, input.scalar_type(), "reflection_pad3d_backward_cpu", [&] { reflection_pad3d_backward_out_loop( grad_input.data_ptr(), grad_output_.data_ptr(), @@ -1061,8 +1061,8 @@ TORCH_IMPL_FUNC(reflection_pad3d_backward_out_cpu)(const Tensor& grad_output, pad_front); }); } else { - AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND1( - kHalf, input.scalar_type(), "reflection_pad3d_backward_cpu", [&] { + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2( + kHalf, kBFloat16, input.scalar_type(), "reflection_pad3d_backward_cpu", [&] { reflection_pad3d_backward_out_frame( grad_input.data_ptr(), grad_output_.data_ptr(), diff --git a/aten/src/ATen/native/Resize.h b/aten/src/ATen/native/Resize.h index c6fe2b3d2146..84fd6c6e6196 100644 --- a/aten/src/ATen/native/Resize.h +++ b/aten/src/ATen/native/Resize.h @@ -83,20 +83,30 @@ inline TensorImpl* resize_impl_cpu_( return self; } +template +T maybe_convert_symint(c10::SymInt) = delete; + +template <> +inline c10::SymInt maybe_convert_symint(c10::SymInt x) { return x; } + +template <> +inline int64_t maybe_convert_symint(c10::SymInt x) { return x.expect_int(); } + +template static inline void checkInBoundsForStorage( - IntArrayRef size, - IntArrayRef stride, - int64_t storage_offset, + ArrayRef size, + ArrayRef stride, + T storage_offset, const caffe2::TypeMeta data_type, const Storage& new_storage) { - int64_t storage_size_bytes = + T storage_size_bytes = at::detail::computeStorageNbytes(size, stride, data_type.itemsize()); - int64_t storage_offset_bytes = storage_offset * data_type.itemsize(); + T storage_offset_bytes = storage_offset * data_type.itemsize(); if 
(storage_size_bytes == 0) { // NB: (a tensor with arbitrary 0 dims)'s storage can have any numel. return; } - int64_t new_storage_size_bytes = new_storage.nbytes(); + T new_storage_size_bytes = maybe_convert_symint(new_storage.sym_nbytes()); TORCH_CHECK( storage_size_bytes + storage_offset_bytes <= new_storage_size_bytes, "setStorage: sizes ", @@ -151,11 +161,12 @@ static inline void checkSetStorage(Tensor& result, Storage storage, int64_t stor * Set self's sizes, strides, and storage_offset. * (size, stride, storage_offset) must be in bounds for self's storage. */ +template inline void setStrided( const Tensor& self, - IntArrayRef size, - IntArrayRef stride, - int64_t storage_offset) { + ArrayRef size, + ArrayRef stride, + T storage_offset) { TORCH_CHECK(size.size() == stride.size(), "mismatch in length of strides and shape"); for (auto val : stride) { TORCH_CHECK(val >= 0, @@ -169,13 +180,7 @@ inline void setStrided( /* storage offset */ TORCH_CHECK(storage_offset >= 0, "Tensor: invalid storage offset ", storage_offset); - self_->set_storage_offset(storage_offset); - - /* size and stride */ - if (self_->sizes() == size && self_->strides() == stride) { - return; - } - self_->set_sizes_and_strides(size, stride); + self_->set_sizes_and_strides(size, stride, c10::make_optional(storage_offset)); } }} diff --git a/aten/src/ATen/native/TensorAdvancedIndexing.cpp b/aten/src/ATen/native/TensorAdvancedIndexing.cpp index f263c2ce2389..101803c71d75 100644 --- a/aten/src/ATen/native/TensorAdvancedIndexing.cpp +++ b/aten/src/ATen/native/TensorAdvancedIndexing.cpp @@ -1095,8 +1095,6 @@ Tensor & index_select_out_cpu_(const Tensor & self, int64_t dim, const Tensor & TORCH_CHECK(index.scalar_type() == ScalarType::Long || index.scalar_type() == ScalarType::Int, "index_select(): Expected dtype int32 or int64 for index"); TORCH_CHECK(self.scalar_type() == result.scalar_type(), "index_select(): self and result must have the same scalar type"); - TORCH_CHECK(dim == 0 || dim < self.dim(), - "index_select(): Indexing dim ", dim, " is out of bounds of tensor"); at::assert_no_internal_overlap(result); at::assert_no_overlap(result, self); at::assert_no_overlap(result, index); diff --git a/aten/src/ATen/native/TensorConversions.cpp b/aten/src/ATen/native/TensorConversions.cpp index c28fe272f861..ea8474960264 100644 --- a/aten/src/ATen/native/TensorConversions.cpp +++ b/aten/src/ATen/native/TensorConversions.cpp @@ -1470,7 +1470,7 @@ Tensor sparse_compressed_to_sparse(const Tensor& self) { // Sparse layout conversions End Tensor to_meta(const Tensor& tensor) { - auto out = at::native::empty_strided_meta(tensor.sizes(), tensor.strides(), \ + auto out = at::native::empty_strided_meta_symint(tensor.sym_sizes(), tensor.sym_strides(), \ /*dtype=*/c10::make_optional(tensor.scalar_type()), /*layout=*/c10::make_optional(tensor.layout()), \ /*device=*/c10::make_optional(c10::Device(c10::kMeta)), /*pin_memory=*/c10::nullopt); // needs to handle wrapped numbers, so dtype promotion works properly. 
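to_meta above now goes through empty_strided_meta_symint so symbolic sizes survive the conversion. The user-visible contract of a meta tensor is unchanged: it carries sizes, strides, and dtype but no data, so ops on it run shape inference only. A usage-level sketch built from stock ATen factories:

```cpp
#include <ATen/ATen.h>

int main() {
  auto src = at::randn({2, 3});

  // A meta tensor mirrors the metadata of `src` without allocating storage.
  auto meta = at::empty_strided(src.sizes(), src.strides(),
                                src.options().device(at::kMeta));

  // Ops on meta tensors compute output metadata only; no data is touched.
  auto out = at::add(meta, meta);
  TORCH_CHECK(out.is_meta());
  TORCH_CHECK(out.sizes() == src.sizes());
  return 0;
}
```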
diff --git a/aten/src/ATen/native/TensorFactories.cpp b/aten/src/ATen/native/TensorFactories.cpp index 6ccbbbac03a7..2e01f7e8699a 100644 --- a/aten/src/ATen/native/TensorFactories.cpp +++ b/aten/src/ATen/native/TensorFactories.cpp @@ -214,12 +214,9 @@ Tensor empty_strided_cpu(IntArrayRef size, IntArrayRef stride, c10::optional optional_memory_format, Tensor& result) { - // TODO: support empty_out properly (I was forced to change this immediately - // with empty so that empty/empty.out had the same type signature) - auto size = c10::asIntArrayRefSlow(sym_size); // Preferably, this argument would not be accepted by _out, but the code // generator requires the out and non-out overloads to match exactly TORCH_CHECK( @@ -386,7 +383,7 @@ Tensor empty_like_quantized( } } -Tensor new_empty( +Tensor new_empty_symint( const Tensor& self, SymIntArrayRef size, c10::optional dtype_opt, @@ -401,10 +398,10 @@ Tensor new_empty( return at::empty_symint(size, dtype, layout, device, pin_memory, c10::nullopt); } -Tensor new_empty_strided( +Tensor new_empty_strided_symint( const Tensor& self, - IntArrayRef size, - IntArrayRef stride, + c10::SymIntArrayRef size, + c10::SymIntArrayRef stride, c10::optional dtype, c10::optional layout, c10::optional device, @@ -413,7 +410,7 @@ Tensor new_empty_strided( // See [Note: hacky wrapper removal for TensorOptions] TensorOptions options = TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory); - return at::empty_strided(size, stride, self.options().merge_in(options)); + return at::empty_strided_symint(size, stride, self.options().merge_in(options)); } // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ eye ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -1077,7 +1074,7 @@ Tensor triu_indices_cpu( // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ zeros ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Tensor zeros(SymIntArrayRef size, +Tensor zeros_symint(SymIntArrayRef size, c10::optional dtype, c10::optional layout, c10::optional device, @@ -1107,8 +1104,7 @@ Tensor& zeros_sparse_out(IntArrayRef size, Tensor& result) { return result; } -Tensor& zeros_out(SymIntArrayRef sym_size, Tensor& result) { - auto size = c10::asIntArrayRefSlow(sym_size); +Tensor& zeros_out(IntArrayRef size, Tensor& result) { if (result.is_sparse()) { // TODO: I think this branch should be dead, but we don't have an easy // way to cover all sparse kernels with zeros_sparse_out, so retain this @@ -1483,7 +1479,7 @@ Tensor clone(const Tensor& src, c10::optional optional_memory if (memory_format == MemoryFormat::Preserve) { if (src.is_non_overlapping_and_dense()) { // Copy all strides, this is marginally faster than calling empty_like - self = at::empty_strided(src.sizes(), src.strides(), src.options()); + self = at::empty_strided_symint(src.sym_sizes(), src.sym_strides(), src.options()); } else { self = at::empty_like(src); } diff --git a/aten/src/ATen/native/TensorShape.cpp b/aten/src/ATen/native/TensorShape.cpp index 3f0b40ba29b7..d72cc0b65293 100644 --- a/aten/src/ATen/native/TensorShape.cpp +++ b/aten/src/ATen/native/TensorShape.cpp @@ -429,6 +429,23 @@ Tensor concat(TensorList tensors, int64_t dim) { return at::cat(tensors, dim); } +// torch.concatenate, alias for torch.cat +Tensor& concatenate_out(TensorList tensors, Dimname dim, Tensor& result) { + return at::cat_out(result, tensors, dimname_to_position(tensors[0], dim)); +} + +Tensor concatenate(TensorList tensors, Dimname dim) { + return at::cat(tensors, dimname_to_position(tensors[0], dim)); +} + +Tensor& concatenate_out(TensorList tensors, int64_t dim, Tensor & 
result) { + return at::cat_out(result, tensors, dim); +} + +Tensor concatenate(TensorList tensors, int64_t dim) { + return at::cat(tensors, dim); +} + static bool sizes_match_except(IntArrayRef s1, IntArrayRef s2, int64_t dim_except /* should already be wrapped */) { if (s1.size() != s2.size()) { return false; } @@ -844,9 +861,7 @@ Tensor diag_embed(const Tensor& self, int64_t offset, int64_t dim1_, int64_t dim return result; } -Tensor expand(const Tensor& self, c10::SymIntArrayRef sym_size, bool /*unused*/) { - // TODO: properly support SymInt expand - auto size = asIntArrayRefSlow(sym_size); +Tensor expand(const Tensor& self, c10::IntArrayRef size, bool /*unused*/) { TORCH_CHECK(size.size() >= (size_t)self.dim(), "expand(", self.toString(), "{", self.sizes(), "}, size=", size, "): the number of sizes provided (", size.size(), ") ", @@ -882,6 +897,15 @@ Tensor make_qtensor(const Tensor& self, IntArrayRef size, IntArrayRef stride, Qu } Tensor as_strided_tensorimpl(const Tensor& self, IntArrayRef size, IntArrayRef stride, optional storage_offset_) { + TORCH_INTERNAL_ASSERT(!self.is_mps(), "as_strided_tensorimpl does not work with MPS; call self.as_strided(...) instead"); + auto storage_offset = storage_offset_.value_or(self.storage_offset()); + auto result = at::detail::make_tensor( + c10::TensorImpl::VIEW, Storage(self.storage()), self.key_set(), self.dtype()); + setStrided(result, size, stride, storage_offset); + return result; +} + +Tensor as_strided_tensorimpl_meta(const Tensor& self, IntArrayRef size, IntArrayRef stride, optional storage_offset_) { auto storage_offset = storage_offset_.value_or(self.storage_offset()); auto result = at::detail::make_tensor( c10::TensorImpl::VIEW, Storage(self.storage()), self.key_set(), self.dtype()); @@ -925,9 +949,8 @@ const Tensor &as_strided_(const Tensor& self, IntArrayRef size, IntArrayRef stri return self; } -Tensor narrow_copy_dense(const Tensor& self, int64_t dim, SymInt start, SymInt length) { - // TODO: properly support SymInt narrow_copy - return self.narrow(dim, start.expect_int(), length.expect_int()).clone(at::MemoryFormat::Contiguous); +Tensor narrow_copy_dense(const Tensor& self, int64_t dim, int64_t start, int64_t length) { + return self.narrow(dim, start, length).clone(at::MemoryFormat::Contiguous); } Tensor narrow_copy_dense_cpu(const Tensor& self, int64_t dim, int64_t start, int64_t length){ @@ -2105,6 +2128,10 @@ Tensor slice( auto quantizer = create_subtensor_quantizer(self, false, start_val, end_val, dim, step); result = as_strided_qtensorimpl(self, sizes, strides, storage_offset, quantizer); } else { + // NB: it is extremely important to perform a redispatch here for + // the MPS backend; if you call as_strided_tensorimpl directly, + // the necessary metadata for MPS will not get set up and you will + // get silently wrong results result = self.as_strided(sizes, strides, storage_offset); } namedinference::propagate_names(result, self); @@ -2644,15 +2671,15 @@ Tensor transpose(const Tensor & self, int64_t dim0, int64_t dim1) { return sparse_compressed_transpose(self, dim0, dim1); } - // Transpose of a tensor is a view operation. - if (dim0 == dim1) { - return self; - } - if (self.is_mkldnn()) { return at::_mkldnn_transpose(self, dim0, dim1); } + // Transpose of a tensor is a view operation. 
+ if (dim0 == dim1) { + return self.alias(); + } + DimVector sizes(self.sizes().begin(), self.sizes().end()); std::swap(sizes[dim0], sizes[dim1]); DimVector strides(self.strides().begin(), self.strides().end()); @@ -3204,19 +3231,13 @@ Tensor adjoint(const Tensor &self) { return _adjoint(self, /*transpose=*/false, "adjoint()"); } -Tensor view_meta(const Tensor& self, - at::SymIntArrayRef size) { - // TODO: Properly support SymInt view - return view_impl(self, c10::asIntArrayRefSlow(size)); -} - Tensor view(const Tensor& self, at::IntArrayRef size) { return view_impl(self, size); } Tensor alias(const Tensor& self) { - return alias_with_sizes_and_strides(self, self.sizes(), self.strides()); + return alias_with_sizes_and_strides(self, self.sizes(), self.strides()); } Tensor detach(const Tensor& self) { @@ -3592,7 +3613,7 @@ at::Tensor& expand_copy_SymInt_out(const at::Tensor & self, c10::SymIntArrayRef } -at::Tensor& expand_copy_out(const at::Tensor & self, at::SymIntArrayRef size, bool implicit, at::Tensor & out) { +at::Tensor& expand_copy_out_symint(const at::Tensor & self, at::SymIntArrayRef size, bool implicit, at::Tensor & out) { auto tmp = self.expand_symint(size, implicit); out.copy_(tmp); return out; @@ -3748,7 +3769,7 @@ void unbind_copy_int_out(const at::Tensor & self, int64_t dim, at::TensorList o } -at::Tensor& view_copy_out(const at::Tensor & self, at::SymIntArrayRef size, at::Tensor & out) { +at::Tensor& view_copy_out_symint(const at::Tensor & self, at::SymIntArrayRef size, at::Tensor & out) { auto tmp = self.view_symint(size); out.copy_(tmp); return out; diff --git a/aten/src/ATen/native/cpu/DistributionKernels.cpp b/aten/src/ATen/native/cpu/DistributionKernels.cpp index 617809e14292..4363cc9d62e3 100644 --- a/aten/src/ATen/native/cpu/DistributionKernels.cpp +++ b/aten/src/ATen/native/cpu/DistributionKernels.cpp @@ -103,88 +103,11 @@ void bernoulli_scalar_kernel(const TensorBase &self, double p, c10::optional gen) { +static void exponential_kernel(TensorIteratorBase& iter, double lambda, c10::optional gen) { CPUGeneratorImpl* generator = get_generator_or_default(gen, detail::getDefaultCPUGenerator()); templates::cpu::exponential_kernel(iter, lambda, generator); } -#if !AT_MKL_ENABLED() -void exponential_kernel(TensorIteratorBase& iter, double lambda, c10::optional gen) { - exponential_kernel_default(iter, lambda, gen); -} -#else -void exponential_kernel(TensorIteratorBase &iter, double lambda, c10::optional gen) { - Tensor self = iter.tensor(0); - if (lambda > 0 && !std::isinf(lambda) && !std::isnan(lambda) && cpuinfo_initialize() && - cpuinfo_vendor_intel == cpuinfo_get_processor(0)->core->vendor) { - CPUGeneratorImpl* generator = get_generator_or_default(gen, detail::getDefaultCPUGenerator()); - int64_t seed; - { - // See Note [Acquire lock when using random generators] - std::lock_guard lock(generator->mutex_); - if (self.scalar_type() == at::kDouble) - seed = generator->random64(); - else - seed = generator->random(); - } - int64_t n = self.numel(); - bool contig = self.is_contiguous(); - - AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, self.scalar_type(), "exponential_cpu", [&] { - at::Tensor tmp_tensor; - constexpr bool is_df = std::is_same::value || std::is_same::value; - if (is_df && contig) { - tmp_tensor = self; - } else if (std::is_same::value) { - tmp_tensor = at::empty(self.sizes(), self.options().dtype(at::kDouble)); - } else { - tmp_tensor = at::empty(self.sizes(), self.options().dtype(at::kFloat)); - } - - scalar_t *self_ptr = 
self.data_ptr(); - using tmp_scalar_t = typename std::conditional_t::value, double, float>; - tmp_scalar_t *sample_ptr = tmp_tensor.data_ptr(); - - auto sample = [&](int64_t begin, int64_t end) { - int64_t len = end - begin; - if (len > 0) { - VSLStreamStatePtr stream; - if (std::is_same::value) { - vslNewStream(&stream, VSL_BRNG_MCG31, seed); - vslSkipAheadStream(stream, begin); - vdRngExponential(VSL_RNG_METHOD_EXPONENTIAL_ICDF, stream, len, - (double *)(sample_ptr + begin), 0, 1./lambda); - vslDeleteStream(&stream); - } else { - vslNewStream(&stream, VSL_BRNG_MCG31, seed); - vslSkipAheadStream(stream, begin); - vsRngExponential(VSL_RNG_METHOD_EXPONENTIAL_ICDF, stream, len, - (float *) (sample_ptr + begin), 0, 1./lambda); - vslDeleteStream(&stream); - } - // vectorized copy if using buffer and contiguous - if (!is_df && contig) { - scalar_t *self_seg = self_ptr + begin; - tmp_scalar_t *tmp_seg = sample_ptr + begin; - at::vec::convert(tmp_seg, self_seg, len); - } - } - }; - - parallel_for(0, n, /* grain_size= */ 800, sample); - - // copy_ if using buffer and non contiguous - if (!contig) { - self.copy_(tmp_tensor); - } - }); - } else { - // The situation of AMD, move to using the default version - exponential_kernel_default(iter, lambda, gen); - } -} -#endif - static void geometric_kernel(TensorIteratorBase& iter, double p, c10::optional gen) { CPUGeneratorImpl* generator = get_generator_or_default(gen, detail::getDefaultCPUGenerator()); templates::cpu::geometric_kernel(iter, p, generator); diff --git a/aten/src/ATen/native/cuda/AveragePool2d.cu b/aten/src/ATen/native/cuda/AveragePool2d.cu index 55632014a0de..46e96e902981 100644 --- a/aten/src/ATen/native/cuda/AveragePool2d.cu +++ b/aten/src/ATen/native/cuda/AveragePool2d.cu @@ -32,8 +32,8 @@ __device__ inline int max(int a, int b) { template __global__ void avg_pool2d_out_cuda_frame(const int nthreads, - const scalar_t* const bottom_data, const int channels, - const int height, const int width, const int pooled_height, + const scalar_t* const bottom_data, const int64_t channels, + const int64_t height, const int64_t width, const int64_t pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, scalar_t* const top_data, const int divisor_override, @@ -81,8 +81,8 @@ __global__ void avg_pool2d_out_cuda_frame(const int nthreads, template __global__ void avg_pool2d_out_cuda_frame_nhwc(const int nthreads, - const scalar_t* const bottom_data, const int channels, - const int height, const int width, const int pooled_height, + const scalar_t* const bottom_data, const int64_t channels, + const int64_t height, const int64_t width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, scalar_t* const top_data, const int divisor_override, @@ -130,8 +130,8 @@ __global__ void avg_pool2d_out_cuda_frame_nhwc(const int nthreads, template __global__ void avg_pool2d_backward_out_cuda_frame(const int nthreads, const scalar_t* const top_diff, - const int channels, const int height, - const int width, const int pooled_height, const int pooled_width, + const int64_t channels, const int64_t height, + const int64_t width, const int64_t pooled_height, const int64_t pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, scalar_t* const bottom_diff, const int divisor_override, @@ -187,8 +187,8 
@@ __global__ void avg_pool2d_backward_out_cuda_frame(const int nthreads, const sca template __global__ void avg_pool2d_backward_out_cuda_frame_nhwc(const int nthreads, const scalar_t* const top_diff, - const int channels, const int height, - const int width, const int pooled_height, const int pooled_width, + const int64_t channels, const int64_t height, + const int64_t width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, scalar_t* const bottom_diff, const int divisor_override, diff --git a/aten/src/ATen/native/cuda/BinaryLogicalOpsKernels.cu b/aten/src/ATen/native/cuda/BinaryLogicalOpsKernels.cu index e69674412c79..cc6046c003e4 100644 --- a/aten/src/ATen/native/cuda/BinaryLogicalOpsKernels.cu +++ b/aten/src/ATen/native/cuda/BinaryLogicalOpsKernels.cu @@ -18,7 +18,7 @@ void logical_and_kernel_cuda(TensorIterator& iter) { #if AT_USE_JITERATOR() static const auto logical_and_string = jiterator_stringify( template - T logical_and_kernel(T a, T b) { + bool logical_and_kernel(T a, T b) { return a && b; } ); // logical_and_string @@ -48,24 +48,76 @@ void logical_and_kernel_cuda(TensorIterator& iter) { } } +const char logical_or_name[] = "logical_or_kernel"; void logical_or_kernel_cuda(TensorIterator& iter) { - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kHalf, kBool, ScalarType::BFloat16, - iter.common_dtype(), "logical_or_cuda", [&]() { + auto dtype = iter.common_dtype(); + if (at::isComplexType(dtype)) { +#if AT_USE_JITERATOR() + static const auto logical_or_string = jiterator_stringify( + template + bool logical_or_kernel(T a, T b) { + return a || b; + } + ); // logical_or_string + AT_DISPATCH_COMPLEX_TYPES(dtype, "logical_or_cuda", [&]() { + jitted_gpu_kernel< + /*name=*/ logical_or_name, + /*return_dtype=*/ scalar_t, + /*common_dtype=*/ scalar_t, + /*arity=*/ 2>(iter, logical_or_string); + }); +#else + AT_DISPATCH_COMPLEX_TYPES(dtype, "logical_or_cuda", [&]() { + gpu_kernel_with_scalars(iter, []GPU_LAMBDA(scalar_t a, scalar_t b) -> bool { + return a || b; + }); + }); +#endif + } else { + AT_DISPATCH_ALL_TYPES_AND3(kHalf, kBool, ScalarType::BFloat16, + dtype, "logical_or_cuda", [&]() { opmath_symmetric_gpu_kernel_with_scalars( iter, []GPU_LAMBDA(scalar_t a, scalar_t b) -> bool { return a || b; }); }); + } } +const char logical_xor_name[] = "logical_xor_kernel"; void logical_xor_kernel_cuda(TensorIterator& iter) { - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kHalf, kBool, ScalarType::BFloat16, - iter.common_dtype(), "logical_xor_cuda", [&]() { + auto dtype = iter.common_dtype(); + if (at::isComplexType(dtype)) { +#if AT_USE_JITERATOR() + static const auto logical_xor_string = jiterator_stringify( + template + bool logical_xor_kernel(T a, T b) { + return bool(a) != bool(b); + } + ); + AT_DISPATCH_COMPLEX_TYPES(dtype, "logical_xor_cuda", [&]() { + jitted_gpu_kernel< + /*name=*/ logical_xor_name, + /*return_dtype=*/ scalar_t, + /*common_dtype=*/ scalar_t, + /*arity=*/ 2>(iter, logical_xor_string); + }); // logical_xor_string +#else + AT_DISPATCH_COMPLEX_TYPES(dtype, "logical_xor_cuda", [&]() { + gpu_kernel_with_scalars(iter, []GPU_LAMBDA(scalar_t a, scalar_t b) -> bool { + return bool(a) != bool(b); + }); + }); +#endif + } else { + AT_DISPATCH_ALL_TYPES_AND3(kHalf, kBool, ScalarType::BFloat16, + dtype, "logical_xor_cuda", [&]() { opmath_symmetric_gpu_kernel_with_scalars( iter, []GPU_LAMBDA(scalar_t a, scalar_t b) -> bool { return bool(a) != bool(b); }); }); + } } 
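With the two hunks above, logical_or and logical_xor accept complex inputs on CUDA, compiled through the jiterator when AT_USE_JITERATOR is enabled and through gpu_kernel_with_scalars otherwise. A usage sketch, assuming a CUDA-enabled build of this branch:

```cpp
#include <ATen/ATen.h>

int main() {
  // A complex element counts as true iff it is nonzero; the result is bool.
  auto a = at::randn({4}, at::dtype(at::kComplexFloat).device(at::kCUDA));
  auto b = at::randn({4}, at::dtype(at::kComplexFloat).device(at::kCUDA));

  auto y = at::logical_xor(a, b);  // per element: bool(a_i) != bool(b_i)
  TORCH_CHECK(y.scalar_type() == at::kBool);
  return 0;
}
```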
REGISTER_DISPATCH(logical_and_stub, &logical_and_kernel_cuda); diff --git a/aten/src/ATen/native/cuda/Col2Im.cu b/aten/src/ATen/native/cuda/Col2Im.cu index fd57ec1f21d4..7b829b801a68 100644 --- a/aten/src/ATen/native/cuda/Col2Im.cu +++ b/aten/src/ATen/native/cuda/Col2Im.cu @@ -105,7 +105,7 @@ void col2im_out_cuda_template( output.zero_(); int64_t output_batch_stride = output.stride(0); - AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND1(kHalf, + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kHalf, kBFloat16, input.scalar_type(), "col2im_out_cuda", [&] { int64_t height_col = (output_height + 2 * pad_height - (dilation_height * (kernel_height - 1) + 1)) / diff --git a/aten/src/ATen/native/cuda/DilatedMaxPool2d.cu b/aten/src/ATen/native/cuda/DilatedMaxPool2d.cu index 05a201147241..dc1f771c9ab8 100644 --- a/aten/src/ATen/native/cuda/DilatedMaxPool2d.cu +++ b/aten/src/ATen/native/cuda/DilatedMaxPool2d.cu @@ -44,8 +44,8 @@ static __device__ inline int p_end(int size, int pad, int pooled_size, int strid // kernels borrowed from Caffe template __global__ void max_pool_forward_nchw(const int nthreads, const scalar_t* bottom_data, - const int channels, const int height, - const int width, const int pooled_height, const int pooled_width, + const int64_t channels, const int64_t height, + const int64_t width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, const int dilation_h, const int dilation_w, scalar_t* top_data, @@ -83,8 +83,8 @@ __global__ void max_pool_forward_nchw(const int nthreads, const scalar_t* bottom template C10_LAUNCH_BOUNDS_1(CUDA_MAX_THREADS) __global__ void max_pool_forward_nhwc(const scalar_t* bottom_data, const int nbatch, - const int channels, const int height, - const int width, const int pooled_height, const int pooled_width, + const int64_t channels, const int64_t height, + const int64_t width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, const int dilation_h, const int dilation_w, @@ -176,8 +176,8 @@ C10_LAUNCH_BOUNDS_2(BLOCK_THREADS, 4) C10_LAUNCH_BOUNDS_2(BLOCK_THREADS, 8) #endif __global__ void max_pool_backward_nchw(const scalar_t* top_diff, - const int64_t* top_mask, const int num, const int channels, - const int height, const int width, const int pooled_height, + const int64_t* top_mask, const int num, const int64_t channels, + const int64_t height, const int64_t width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, const int dilation_h, const int dilation_w, @@ -209,8 +209,8 @@ __global__ void max_pool_backward_nchw(const scalar_t* top_diff, template C10_LAUNCH_BOUNDS_1(CUDA_MAX_THREADS) __global__ void max_pool_backward_nhwc(const scalar_t* top_diff, - const int64_t* top_mask, const int nbatch, const int channels, - const int height, const int width, const int pooled_height, + const int64_t* top_mask, const int nbatch, const int64_t channels, + const int64_t height, const int64_t width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, const int dilation_h, const int dilation_w, @@ -242,9 +242,9 @@ __global__ void max_pool_backward_nhwc(const scalar_t* top_diff, int iH = (height + gridDim.z-1) / gridDim.z; int iW = (width 
+ gridDim.y-1) / gridDim.y; int istartH = threadIdx.z + blockIdx.z*iH; - int iendH = ::min(istartH+iH, height); + int iendH = ::min(static_cast(istartH)+iH, height); int istartW = threadIdx.y + blockIdx.y*iW; - int iendW = ::min(istartW+iW, width); + int iendW = ::min(static_cast(istartW)+iW, width); for (int ih = istartH; ih < iendH; ih+=blockDim.z) { int phstart = p_start(ih, pad_h, kernel_h, dilation_h, stride_h); diff --git a/aten/src/ATen/native/cuda/Im2Col.cu b/aten/src/ATen/native/cuda/Im2Col.cu index 89b2a1879b4b..5ca540964d52 100644 --- a/aten/src/ATen/native/cuda/Im2Col.cu +++ b/aten/src/ATen/native/cuda/Im2Col.cu @@ -106,7 +106,7 @@ static void im2col_out_cuda_template( output.zero_(); // Launch kernel - AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND1(kHalf, + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kHalf, kBFloat16, input.scalar_type(), "im2col_out_cuda", [&] { Tensor input_n; Tensor output_n; diff --git a/aten/src/ATen/native/cuda/LinearAlgebraStubs.cpp b/aten/src/ATen/native/cuda/LinearAlgebraStubs.cpp index cb6cacb3630f..f5816c8c6747 100644 --- a/aten/src/ATen/native/cuda/LinearAlgebraStubs.cpp +++ b/aten/src/ATen/native/cuda/LinearAlgebraStubs.cpp @@ -93,11 +93,6 @@ void lazy_linalg_eigh_kernel(const Tensor& eigenvalues, const Tensor& eigenvecto linalg_eigh_stub(DeviceType::CUDA, eigenvalues, eigenvectors, infos, upper, compute_eigenvectors); } -std::tuple lazy_eig_kernel(const Tensor& self, bool& eigenvectors) { - loadLazyTorchLinalgLibrary(); - return eig_stub(DeviceType::CUDA, self, eigenvectors); -} - void lazy_linalg_eig_kernel(Tensor& eigenvalues, Tensor& eigenvectors, Tensor& infos, const Tensor& input, bool compute_eigenvectors) { getTorchLinalgLibrary(); linalg_eig_stub(DeviceType::CUDA, eigenvalues, eigenvectors, infos, input, compute_eigenvectors); @@ -155,7 +150,6 @@ REGISTER_CUDA_DISPATCH(orgqr_stub, &lazy_orgqr_kernel); REGISTER_CUDA_DISPATCH(ormqr_stub, &lazy_ormqr_kernel); REGISTER_CUDA_DISPATCH(geqrf_stub, &lazy_geqrf_kernel); REGISTER_CUDA_DISPATCH(linalg_eigh_stub, &lazy_linalg_eigh_kernel); -REGISTER_CUDA_DISPATCH(eig_stub, &lazy_eig_kernel); REGISTER_CUDA_DISPATCH(linalg_eig_stub, &lazy_linalg_eig_kernel); REGISTER_CUDA_DISPATCH(svd_stub, &lazy_svd_kernel) REGISTER_CUDA_DISPATCH(lu_solve_stub, &lazy_lu_solve); diff --git a/aten/src/ATen/native/cuda/MaxUnpooling.cu b/aten/src/ATen/native/cuda/MaxUnpooling.cu index 9c24c4ea8edc..ba1a7eb1f5cb 100644 --- a/aten/src/ATen/native/cuda/MaxUnpooling.cu +++ b/aten/src/ATen/native/cuda/MaxUnpooling.cu @@ -118,6 +118,10 @@ Tensor& max_unpooling2d_forward_out_cuda(const Tensor& self_, const Tensor& indices_, IntArrayRef output_size, Tensor& output) { + // See Note [Writing Nondeterministic Operations] + // Nondeterministic with duplicate indices + at::globalContext().alertNotDeterministic("max_unpooling2d_forward_out"); + TORCH_CHECK(output.is_contiguous(), "output must be contiguous"); TORCH_CHECK( indices_.scalar_type() == at::ScalarType::Long, @@ -291,6 +295,10 @@ Tensor& max_unpooling3d_forward_out_cuda(const Tensor& self_, IntArrayRef stride, IntArrayRef padding, Tensor& output) { + // See Note [Writing Nondeterministic Operations] + // Nondeterministic with duplicate indices + at::globalContext().alertNotDeterministic("max_unpooling3d_forward_out"); + TORCH_CHECK(output.is_contiguous(), "output must be contiguous"); max_unpooling3d_shape_check( self_, Tensor(), indices_, output_size, stride, padding, "max_unpooling3d_forward_out_cuda()"); diff --git a/aten/src/ATen/native/cuda/ReflectionPad.cu 
b/aten/src/ATen/native/cuda/ReflectionPad.cu index 33f71368ca10..5380b0fef5f2 100644 --- a/aten/src/ATen/native/cuda/ReflectionPad.cu +++ b/aten/src/ATen/native/cuda/ReflectionPad.cu @@ -335,7 +335,7 @@ void reflection_pad2d_out_template( int64_t size_y = nplane; int64_t size_z = nbatch; - AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND1(kHalf, + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kHalf, kBFloat16, input.scalar_type(), "reflection_pad2d_out_template", [&] { for (int64_t block_y = 0; block_y < size_y; block_y += 65535) { @@ -407,7 +407,7 @@ void reflection_pad2d_backward_out_template( int64_t size_y = nplane; int64_t size_z = nbatch; - AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND1(kHalf, + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kHalf, kBFloat16, input.scalar_type(), "reflection_pad2d_backward_out_template", [&] { for (int64_t block_y = 0; block_y < size_y; block_y += 65535) { @@ -463,8 +463,8 @@ TORCH_IMPL_FUNC(reflection_pad1d_out_cuda) Tensor input = input_.contiguous(); - AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND1( - kHalf, input.scalar_type(), "reflection_pad1d_out_template", [&] { + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2( + kHalf, kBFloat16, input.scalar_type(), "reflection_pad1d_out_template", [&] { reflection_pad1d_out_kernel<<< grid_size, block_size, @@ -520,7 +520,7 @@ TORCH_IMPL_FUNC(reflection_pad1d_backward_out_cuda)(const Tensor& grad_output_, dim3 block_size(output_w > 256 ? 256 : output_w); dim3 grid_size((int) ::ceil(output_w / 256.0), nplane, nbatch); - AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND1(kHalf, + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kHalf, kBFloat16, grad_input.scalar_type(), "reflection_pad1d_backward_out_cuda", [&] { reflection_pad1d_backward_out_kernel<<< grid_size, block_size, 0, at::cuda::getCurrentCUDAStream()>>>( @@ -589,7 +589,7 @@ TORCH_IMPL_FUNC(reflection_pad3d_out_cuda) ( auto input = input_.contiguous(); bool batch_mode = (input.dim() == 5); - AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND1(kHalf, + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kHalf, kBFloat16, input.scalar_type(), "reflection_pad3d_out_cuda", [&] { auto input_inner = input; auto output_inner = output; @@ -641,7 +641,7 @@ TORCH_IMPL_FUNC(reflection_pad3d_backward_out_cuda) ( int64_t pad_top = padding[2]; int64_t pad_front = padding[4]; - AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND1(kHalf, + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kHalf, kBFloat16, input.scalar_type(), "reflection_pad3d_backward_out_cuda", [&] { auto grad_input_ = grad_input; auto grad_output_ = grad_output; diff --git a/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebra.cpp b/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebra.cpp index 061e7e86de8b..a7d379ec4620 100644 --- a/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebra.cpp +++ b/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebra.cpp @@ -2036,96 +2036,6 @@ void linalg_eigh_kernel(const Tensor& eigenvalues, const Tensor& eigenvectors, c REGISTER_CUDA_DISPATCH(linalg_eigh_stub, &linalg_eigh_kernel); -// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ eig ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -// magmaEig uses a hybrid CPU-GPU algorithm, which takes and return CPU -// memory. So, we accept a GPU tensor, copy it to CPU memory, and later copy -// the returned values from CPU to GPU. See also magmaSymeig, which uses a -// similar approach. 
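The ReflectionPad.cu hunks above (and their CPU counterparts earlier in this diff) only widen the dispatch macro from ..._AND1(kHalf, ...) to ..._AND2(kHalf, kBFloat16, ...); that alone instantiates the kernels for BFloat16. A quick check of the newly covered dtype, assuming a build that includes these changes:

```cpp
#include <ATen/ATen.h>

int main() {
  // Before this change, the AND1 macro covered only Half on top of the
  // floating and complex types, so a bf16 input hit a dispatch error.
  auto x = at::randn({1, 1, 2, 2, 2}, at::dtype(at::kBFloat16));
  auto y = at::reflection_pad3d(x, /*padding=*/{1, 1, 1, 1, 1, 1});
  TORCH_CHECK(y.scalar_type() == at::kBFloat16);
  return 0;
}
```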
- -template -static void apply_eig(const Tensor& self, bool eigenvectors, Tensor& out_eigvals, Tensor& out_eigvecs, - int* info_ptr) { -#if !AT_MAGMA_ENABLED() -TORCH_CHECK(false, "Calling torch.eig on a CUDA tensor requires compiling PyTorch with MAGMA. " - "Either transfer the tensor to the CPU before calling torch.eig or recompile with MAGMA."); -#else - TORCH_INTERNAL_ASSERT(self.device() == at::kCPU, "Internal error: apply_eig needs a CPU tensor"); - using value_t = typename c10::scalar_value_type::type; - magma_vec_t jobvr = eigenvectors ? MagmaVec : MagmaNoVec; - magma_int_t n = magma_int_cast(self.size(-1), "n"); - auto self_data = self.data_ptr(); - - auto out_eigvals_data = out_eigvals.data_ptr(); - scalar_t *wr = out_eigvals_data; - - scalar_t *vr_data = NULL; - magma_int_t ldvr = 1; - if (jobvr == MagmaVec) - { - vr_data = out_eigvecs.data_ptr(); - ldvr = n; - } - - value_t *rwork_data = nullptr; - if (isComplexType(at::typeMetaToScalarType(self.dtype()))) { - ALLOCATE_ARRAY(rwork_data, value_t, n*2); - } - - if (n > 0) { - // call magmaEig once to get the optimal size of work_data - scalar_t wkopt; - magma_int_t info; - magmaEig(MagmaNoVec, jobvr, n, self_data, n, wr, NULL, 1, vr_data, ldvr, &wkopt, -1, rwork_data, &info); - magma_int_t lwork = static_cast(real_impl(wkopt)); - - // call it a 2nd time to to the actual work - scalar_t *work_data = nullptr; - ALLOCATE_ARRAY(work_data, scalar_t, lwork); - magmaEig(MagmaNoVec, jobvr, n, self_data, n, wr, NULL, 1, vr_data, ldvr, work_data, lwork, rwork_data, &info); - *info_ptr = info; - } -#endif -} - -/* - * Internal helper; like eig_cuda but: - * 1. assume that self is a square matrix of side "n" - * 2. return CPU tensors (because this is what magmaEig returns), which will be copied to GPU memory - * by the caller - */ -std::tuple eig_kernel_impl(const Tensor& self, bool& eigenvectors) { - int64_t n = self.size(-1); - // copy self to pinned CPU memory - auto self_working_copy = at::empty_strided( - {n, n}, // square matrix - {1, n}, // column-ordered, as magmaEig expects - at::TensorOptions(at::kCPU).dtype(self.dtype()).pinned_memory(true)); - self_working_copy.copy_(self); - - // tensors holding the results. We use empty_strided to make them column-ordered - auto options = self.options().device(at::kCPU).memory_format(LEGACY_CONTIGUOUS_MEMORY_FORMAT); - Tensor out_eigvals; - if (isComplexType(at::typeMetaToScalarType(self.dtype()))) { - out_eigvals = at::empty({n}, options); - } else { - out_eigvals = at::empty_strided({n, 2}, {1, n}, options); - } - auto out_eigvecs = eigenvectors - ? 
at::empty_strided({n, n}, {1, n}, options) - : Tensor(); - - auto infos = at::zeros({}, self_working_copy.options().dtype(kInt)); - AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(self.scalar_type(), "eig_cuda", [&]{ - apply_eig(self_working_copy, eigenvectors, out_eigvals, out_eigvecs, infos.data_ptr()); - }); - at::_linalg_check_errors(infos, "eig", /*is_matrix*/true); - - return std::tuple(out_eigvals, out_eigvecs); -} - -REGISTER_CUDA_DISPATCH(eig_stub, &eig_kernel_impl); - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ linalg_eig ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ /* diff --git a/aten/src/ATen/native/metal/ops/MetalReshape.mm b/aten/src/ATen/native/metal/ops/MetalReshape.mm index 1001b6690ad8..eca282a25bae 100644 --- a/aten/src/ATen/native/metal/ops/MetalReshape.mm +++ b/aten/src/ATen/native/metal/ops/MetalReshape.mm @@ -64,7 +64,7 @@ Tensor view(const Tensor& input, c10::SymIntArrayRef sym_size) { Tensor reshape(const Tensor& input, IntArrayRef shape) { TORCH_CHECK(input.is_metal()); - return view(input, c10::SymIntArrayRef::fromIntArrayRef(shape)); + return view(input, c10::fromIntArrayRef(shape)); } Tensor flatten_using_ints( diff --git a/aten/src/ATen/native/mps/operations/BinaryOps.mm b/aten/src/ATen/native/mps/operations/BinaryOps.mm index b619307ef8aa..4a92048e4c4a 100644 --- a/aten/src/ATen/native/mps/operations/BinaryOps.mm +++ b/aten/src/ATen/native/mps/operations/BinaryOps.mm @@ -72,16 +72,37 @@ void binaryOpTensor(const Tensor& self, const Tensor& other, const Scalar& alpha // this type inference is only required at the time of graph creation const ScalarType common_dtype = c10::promoteTypes(self.scalar_type(), other.scalar_type()); - if (self.scalar_type() != common_dtype) { - primaryCastTensor = castMPSTensor(mpsGraph, newCachedGraph->primaryTensor, common_dtype); + + // Condition - + // 1. Division operation + // 2. 
diff --git a/aten/src/ATen/native/mps/operations/BinaryOps.mm b/aten/src/ATen/native/mps/operations/BinaryOps.mm
index b619307ef8aa..4a92048e4c4a 100644
--- a/aten/src/ATen/native/mps/operations/BinaryOps.mm
+++ b/aten/src/ATen/native/mps/operations/BinaryOps.mm
@@ -72,16 +72,37 @@ void binaryOpTensor(const Tensor& self, const Tensor& other, const Scalar& alpha
         // this type inference is only required at the time of graph creation
         const ScalarType common_dtype = c10::promoteTypes(self.scalar_type(), other.scalar_type());
-        if (self.scalar_type() != common_dtype) {
-          primaryCastTensor = castMPSTensor(mpsGraph, newCachedGraph->primaryTensor, common_dtype);
+
+        // Condition -
+        // 1. Division operation
+        // 2. Inputs are not float
+        bool div_condition = op_name.rfind("div", 0) == 0
+                    && (!(common_dtype == ScalarType::Float || common_dtype == ScalarType::Half));
+
+        auto compute_type = ScalarType::Float;
+
+        if(div_condition) {
+
+          if(output_.scalar_type() == ScalarType::Float || output_.scalar_type() == ScalarType::Half)
+            compute_type = output_.scalar_type();
+
+          primaryCastTensor = castMPSTensor(mpsGraph, newCachedGraph->primaryTensor, compute_type);
+          secondaryCastTensor = castMPSTensor(mpsGraph, newCachedGraph->secondaryTensor, compute_type);
         }
-        if (other.scalar_type() != common_dtype) {
-          secondaryCastTensor = castMPSTensor(mpsGraph, newCachedGraph->secondaryTensor, common_dtype);
+        else {
+          if (self.scalar_type() != common_dtype) {
+            primaryCastTensor = castMPSTensor(mpsGraph, newCachedGraph->primaryTensor, common_dtype);
+          }
+          if (other.scalar_type() != common_dtype) {
+            secondaryCastTensor = castMPSTensor(mpsGraph, newCachedGraph->secondaryTensor, common_dtype);
+          }
         }
         newCachedGraph->outputTensor = binaryBlock(newCachedGraph, primaryCastTensor, secondaryCastTensor);
         // Cast output tensor to an expected type if needed, which addresses discrepancy when int64 scalar is added to int32 tensor
         // Output tensor should have been promoted but it remains an int32 tensor
-        if (output_.scalar_type() != common_dtype) {
+
+        if ((div_condition && compute_type != output_.scalar_type()) ||
+            output_.scalar_type() != common_dtype) {
           newCachedGraph->outputTensor = castMPSTensor(mpsGraph, newCachedGraph->outputTensor, output_.scalar_type());
         }
       }
@@ -138,7 +159,11 @@ void div_mode_template(const Tensor& self, const Tensor& other,
   MPSGraphTensor* divTensor = [mpsGraph divisionWithPrimaryTensor:primaryCastTensor
                                                   secondaryTensor:secondaryCastTensor
                                                              name:nil];
-  if (!rounding_mode.has_value()) {
+  // Rounding is a no-op for integral types, and also a reasonable workaround
+  // For MPSGraph bug on Apple Silicon, that throws `Function floorOp_i64 was not found in the library`
+  // See https://github.com/pytorch/pytorch/issues/84995
+  bool isFloatOutput = ([divTensor dataType] & MPSDataTypeFloatBit) != 0;
+  if (!rounding_mode.has_value() || !isFloatOutput) {
     return divTensor;
   } else if (*rounding_mode == "trunc") {
     return trunc_tensor(mpsGraph, divTensor);
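To make the casting rule in binaryOpTensor above concrete: for a div-family op whose promoted input dtype is integral, the computation is forced into a floating-point type, preferring the output's dtype when that is already Float or Half. A standalone sketch of just that decision (the enum is simplified; only the logic visible in the hunk is modeled):

    #include <iostream>
    #include <string>

    enum class ScalarType { Float, Half, Int, Long };

    ScalarType divComputeType(const std::string& op_name,
                              ScalarType common_dtype,
                              ScalarType output_dtype) {
      // Same test as the hunk: op name starts with "div" and the promoted
      // input dtype is not floating point.
      const bool div_condition = op_name.rfind("div", 0) == 0 &&
          !(common_dtype == ScalarType::Float || common_dtype == ScalarType::Half);
      if (!div_condition) {
        return common_dtype;
      }
      const bool out_is_float = output_dtype == ScalarType::Float ||
          output_dtype == ScalarType::Half;
      return out_is_float ? output_dtype : ScalarType::Float;
    }

    int main() {
      // int / int with a float output computes in Float, not Int.
      std::cout << (divComputeType("div_out", ScalarType::Int, ScalarType::Float) ==
                    ScalarType::Float)
                << std::endl;
    }

The trailing cast back to output_.scalar_type() then restores the expected output dtype whenever the compute type diverged from it.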
diff --git a/aten/src/ATen/native/mps/operations/Copy.mm b/aten/src/ATen/native/mps/operations/Copy.mm
index 3c2ab0d6c2f8..b99e87661e30 100644
--- a/aten/src/ATen/native/mps/operations/Copy.mm
+++ b/aten/src/ATen/native/mps/operations/Copy.mm
@@ -36,7 +36,7 @@
 // Copy sourceBuffer into destBuffer, casting sourceBuffer to src.scalar_type().
 // The shapes and dtypes are taken from dst and src, but their storage pointers are not used.
 void copy_cast_mps(at::Tensor& dst, const at::Tensor& src,
-                   id<MTLBuffer> destBuffer, id<MTLBuffer> sourceBuffer) {
+                   id<MTLBuffer> destBuffer, id<MTLBuffer> sourceBuffer, bool non_blocking = true) {
   using namespace mps;
 
   struct CachedGraph : public MPSCachedGraph
@@ -84,6 +84,8 @@ void copy_cast_mps(at::Tensor& dst, const at::Tensor& src,
     NSDictionary* feeds = @{cachedGraph->inputTensor_: srcData};
     NSDictionary* results = @{cachedGraph->outputTensor_: dstData};
     runMPSGraph(stream, cachedGraph->graph(), feeds, results);
+    if (!non_blocking)
+      stream->synchronize(SyncType::COMMIT_AND_WAIT);
   }
 }
 
@@ -113,38 +115,52 @@ void copy_cast_mps(at::Tensor& dst, const at::Tensor& src,
     src = src_;
   }
   id<MTLBuffer> sourceBuffer = getMTLBufferStorage(src);
-  size_t src_total_size = src_.is_view() ? at::detail::computeStorageNbytesContiguous(src.sizes(), src.element_size(), src.storage_offset()) :
-                          src.nbytes();
-  size_t size_to_copy = src.nbytes();
-
-  // In case of dtype change, first convert src inplace
-  if (src_.dtype() != dst_.dtype()) {
-    copy_cast_mps(dst, src, sourceBuffer, sourceBuffer);
-    // Use the element size of dst to calculate the total size after casting
-    size_to_copy = (size_to_copy / src.element_size()) * dst.element_size();
-  }
-
-  // If there's anything wrong with source, we shouldn't return dst_ silently and must error out.
-  TORCH_INTERNAL_ASSERT(sourceBuffer && size_to_copy > 0);
-  TORCH_INTERNAL_ASSERT(src_total_size >= storage_byte_offset);
-  TORCH_INTERNAL_ASSERT(dst.nbytes() >= (dst.storage_offset() * dst.element_size()));
+  size_t dst_tensor_nbytes = dst.nbytes();
 
   @autoreleasepool {
     MTLResourceOptions options = MTLResourceOptionCPUCacheModeDefault | MTLResourceStorageModeShared;
     NSUInteger alignedLength = 0;
 
     void* host_dst = dst.storage().data();
-    void* alignedPtr = pageAlignedBlockPtr(host_dst, (NSUInteger)src_total_size, &alignedLength);
+    void* alignedPtr = pageAlignedBlockPtr(host_dst, (NSUInteger)dst_tensor_nbytes, &alignedLength);
+    NSUInteger destOffset = (uintptr_t(host_dst) - uintptr_t(alignedPtr));
+    // 4 bytes alignment required on macos for blits.
+    TORCH_INTERNAL_ASSERT(destOffset % 4 == 0, "Unaligned blit request");
+
     id<MTLBuffer> destBuffer = [device newBufferWithBytesNoCopy:alignedPtr
                                                          length:alignedLength
                                                         options:options
                                                     deallocator:nil];
-    NSUInteger destOffset = uintptr_t(host_dst) - uintptr_t(alignedPtr);
-    // 4 bytes alignment required on macos for blits.
-    TORCH_INTERNAL_ASSERT(destOffset % 4 == 0, "Unaligned blit request");
+    id<MTLBuffer> tmpBuffer = sourceBuffer;
+    Tensor tmp;
+    bool needsBlit = true;
+    if (src_.dtype() != dst.dtype()) {
+      if (destOffset == 0 && storage_byte_offset == 0) {
+        // Return the casted tensor directly if there's no destination offset
+        needsBlit = false;
+        tmpBuffer = destBuffer;
+      } else if (src.element_size() < dst.element_size()) {
+        tmp = at::native::empty_mps(dst.sizes(), dst.scalar_type(), c10::nullopt, kMPS);
+        tmpBuffer = getMTLBufferStorage(tmp);
+      }
+    }
+
+    size_t size_to_copy = src.nbytes();
+    // In case of dtype change, first convert src inplace
+    if (src_.dtype() != dst.dtype()) {
+      copy_cast_mps(dst, src, tmpBuffer, sourceBuffer, non_blocking);
+    }
+
+    if (needsBlit) {
+      size_to_copy = (size_to_copy / src.element_size()) * dst.element_size();
 
-    stream->copy_and_sync(sourceBuffer, destBuffer, size_to_copy, storage_byte_offset, destOffset, non_blocking);
-    [destBuffer release];
+      // If there's anything wrong with source, we shouldn't return dst_ silently and must error out.
+      TORCH_INTERNAL_ASSERT(sourceBuffer && dst_tensor_nbytes > 0);
+      TORCH_INTERNAL_ASSERT(dst_tensor_nbytes >= (dst.storage_offset() * dst.element_size()));
+
+      stream->copy_and_sync(tmpBuffer, destBuffer, size_to_copy, storage_byte_offset, destOffset, non_blocking);
+      [destBuffer release];
+    }
   }
   if (!dst.is_same(dst_)) {
     dst_.copy_(dst, non_blocking);
@@ -235,17 +251,29 @@ void copy_blit_mps(void* dst, const void* src, size_t size) {
   } else {
     src = src_;
   }
+  id<MTLBuffer> destBuffer = getMTLBufferStorage(dst_);
+  id<MTLBuffer> sourceBuffer = getMTLBufferStorage(src);
+
   // Scatter to `dst` if the memory is not contiguous
   // If the memory is not contiguous, it means that the tensor has strides and we would not be
   // able to do the copy using a single blit
   if (!dst_.is_contiguous()) {
-    return scatterViewTensor(src, dst_);
+    Tensor tmp;
+    if (src.dtype() != dst_.dtype()) {
+      id<MTLBuffer> tmpBuffer = sourceBuffer;
+      if (src.element_size() < dst_.element_size()) {
+        tmp = at::native::empty_mps(dst_.sizes(), dst_.scalar_type(), c10::nullopt, kMPS);
+        tmpBuffer = getMTLBufferStorage(tmp);
+      }
+
+      copy_cast_mps(dst_, src, tmpBuffer, sourceBuffer);
+    }
+
+    return scatterViewTensor((src.dtype() != dst_.dtype() && tmp.has_storage()) ? tmp : src, dst_);
   }
   src._set_conj(src_.is_conj());
   src._set_neg(src_.is_neg());
 
-  id<MTLBuffer> destBuffer = getMTLBufferStorage(dst_);
-  id<MTLBuffer> sourceBuffer = getMTLBufferStorage(src);
   const size_t src_size = src.nbytes();
   if (src.dtype() == dst_.dtype()) {
     MPSStream* stream = getCurrentMPSStream();
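pageAlignedBlockPtr itself is not shown in this diff; under the assumption that it behaves like the usual page-alignment helper (round the host pointer down to a page boundary, round the length up, report both), the destOffset computation and the 4-byte assertion reduce to something like this sketch:

    #include <cassert>
    #include <cstdint>
    #include <cstdlib>
    #include <unistd.h>

    // Sketch of a pageAlignedBlockPtr-style helper: Metal's
    // newBufferWithBytesNoCopy requires page-aligned memory, so the real data
    // lives at some offset inside the aligned block.
    static void* pageAlignedBase(void* ptr, size_t nbytes, size_t* alignedLength) {
      const uintptr_t pageSize = static_cast<uintptr_t>(sysconf(_SC_PAGESIZE));
      const uintptr_t addr = reinterpret_cast<uintptr_t>(ptr);
      const uintptr_t base = addr & ~(pageSize - 1);                           // round down
      const uintptr_t end = (addr + nbytes + pageSize - 1) & ~(pageSize - 1);  // round up
      *alignedLength = end - base;
      return reinterpret_cast<void*>(base);
    }

    int main() {
      void* host_dst = std::malloc(4096);
      size_t alignedLength = 0;
      void* alignedPtr = pageAlignedBase(host_dst, 100, &alignedLength);
      const size_t destOffset =
          reinterpret_cast<uintptr_t>(host_dst) - reinterpret_cast<uintptr_t>(alignedPtr);
      assert(destOffset % 4 == 0);  // the same invariant the diff asserts for blits
      std::free(host_dst);
      return 0;
    }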
diff --git a/aten/src/ATen/native/mps/operations/Normalization.mm b/aten/src/ATen/native/mps/operations/Normalization.mm
index 2e026b9acb46..5384ee666fea 100644
--- a/aten/src/ATen/native/mps/operations/Normalization.mm
+++ b/aten/src/ATen/native/mps/operations/Normalization.mm
@@ -823,7 +823,7 @@ string get_mem_string(c10::MemoryFormat memory_format) {
   const int normalized_ndim = normalized_shape.size();
   // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
   const int axis = input_ndim - normalized_ndim;
-  at::Tensor input_reshaped = input.view({1, M, -1});
+  at::Tensor input_reshaped = input.reshape({1, M, -1});
   // Unlike Batch Normalization, which applies scalar scale and bias for each
   // entire channel/plane with the affine option, Layer Normalization applies
   // per-element scale and bias. E.g. For input {N, C, H, W}, weight for
diff --git a/aten/src/ATen/native/mps/operations/UnaryOps.mm b/aten/src/ATen/native/mps/operations/UnaryOps.mm
index 2231a66fb3ac..97f3d18626ef 100644
--- a/aten/src/ATen/native/mps/operations/UnaryOps.mm
+++ b/aten/src/ATen/native/mps/operations/UnaryOps.mm
@@ -61,6 +61,14 @@ void unary_op(const Tensor& self, const Tensor& output, std::string op_name, Una
 
 MPSGraphTensor* trunc_tensor(MPSGraph* mpsGraph, MPSGraphTensor* inputTensor)
 {
+  // Rounding is a no-op for integral types, and also a reasonable workaround
+  // For MPSGraph bug on Apple Silicon, that throws `Function floorOp_i64 was not found in the library`
+  // See https://github.com/pytorch/pytorch/issues/84995
+  bool isFloatInput = ([inputTensor dataType] & MPSDataTypeFloatBit) != 0;
+  if (!isFloatInput) {
+    return inputTensor;
+  }
+
   MPSGraphTensor* zeroTensor = [mpsGraph constantWithScalar:0.0
                                                    dataType:inputTensor.dataType];
   MPSGraphTensor* predicateTensor = [mpsGraph lessThanWithPrimaryTensor:inputTensor
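The early return added to trunc_tensor relies on truncation being the identity on integers; for floating-point inputs the graph it builds selects between ceil and floor based on sign. An illustrative scalar equivalent:

    #include <cassert>
    #include <cmath>

    // trunc(x) for floats: ceil for negative values, floor otherwise --
    // the same select-on-predicate the MPSGraph code constructs.
    static double trunc_like(double x) {
      return x < 0.0 ? std::ceil(x) : std::floor(x);
    }

    // For integral inputs, rounding is a no-op, so return the value unchanged
    // (this is what the new early return does for integer tensors).
    static long trunc_like(long x) {
      return x;
    }

    int main() {
      assert(trunc_like(2.7) == 2.0);
      assert(trunc_like(-2.7) == -2.0);  // note: floor alone would give -3.0
      assert(trunc_like(5L) == 5L);
      return 0;
    }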
diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml
index dfb3bddc5238..f706bbd995c7 100644
--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@@ -1209,6 +1209,15 @@
 
 - func: concat.names_out(Tensor[] tensors, Dimname dim, *, Tensor(a!) out) -> Tensor(a!)
 
+# alias for torch.cat
+- func: concatenate(Tensor[] tensors, int dim=0) -> Tensor
+
+- func: concatenate.out(Tensor[] tensors, int dim=0, *, Tensor(a!) out) -> Tensor(a!)
+
+- func: concatenate.names(Tensor[] tensors, Dimname dim) -> Tensor
+
+- func: concatenate.names_out(Tensor[] tensors, Dimname dim, *, Tensor(a!) out) -> Tensor(a!)
+
 - func: block_diag(Tensor[] tensors) -> Tensor
   variants: function
   dispatch:
@@ -2054,7 +2063,7 @@
     CPU: empty_cpu
     CUDA: empty_cuda
     MPS: empty_mps
-    Meta: empty_meta
+    Meta: empty_meta_symint
     MkldnnCPU: empty_mkldnn
     SparseCPU, SparseCUDA, SparseMeta: empty_sparse
     SparseCsrCPU, SparseCsrCUDA: empty_sparse_compressed
@@ -2065,13 +2074,13 @@
 - func: new_empty(Tensor self, SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
   variants: method
   dispatch:
-    CompositeExplicitAutograd: new_empty
+    CompositeExplicitAutograd: new_empty_symint
   autogen: new_empty.out
 
-- func: new_empty_strided(Tensor self, int[] size, int[] stride, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+- func: new_empty_strided(Tensor self, SymInt[] size, SymInt[] stride, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
   variants: method
   dispatch:
-    CompositeExplicitAutogradNonFunctional: new_empty_strided
+    CompositeExplicitAutogradNonFunctional: new_empty_strided_symint
   autogen: new_empty_strided.out
 
 - func: new_full(Tensor self, int[] size, Scalar fill_value, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
@@ -2159,12 +2168,12 @@
     SparseCsrCPU, SparseCsrCUDA: empty_like_sparse_csr
   autogen: empty_like.out
 
-- func: empty_strided(int[] size, int[] stride, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+- func: empty_strided(SymInt[] size, SymInt[] stride, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
   dispatch:
     CPU: empty_strided_cpu
     CUDA: empty_strided_cuda
     MPS: empty_strided_mps
-    Meta: empty_strided_meta
+    Meta: empty_strided_meta_symint
    QuantizedCPU, QuantizedCUDA: empty_strided_unknown_quantized
   autogen: empty_strided.out
@@ -5287,12 +5296,24 @@
     CUDA: nested_from_padded_cuda
   autogen: _nested_from_padded.out
 
+# These private functions are temporary. They will be updated/deleted when nested tensors switch to using SymInts for their metadata representation
 - func: _nested_tensor_size(Tensor self) -> Tensor
   variants: method
   dispatch:
     NestedTensorCPU, NestedTensorCUDA: _nested_tensor_size
   autogen: _nested_tensor_size.out
 
+- func: _nested_tensor_strides(Tensor self) -> Tensor
+  variants: method
+  dispatch:
+    NestedTensorCPU, NestedTensorCUDA: _nested_tensor_strides
+  autogen: _nested_tensor_strides.out
+
+- func: _nested_tensor_offsets(Tensor self) -> int[]
+  variants: method
+  dispatch:
+    NestedTensorCPU, NestedTensorCUDA: _nested_tensor_offsets
+
 # _nested_from_padded is not usable from Python, so
 # _nested_from_padded_and_nested_example is available for testing.
 - func: _nested_from_padded_and_nested_example(Tensor padded, Tensor nt_example) -> Tensor
@@ -5300,6 +5321,22 @@
     NestedTensorCPU, NestedTensorCUDA: NestedTensor_from_padded_and_nested_example
   autogen: _nested_from_padded_and_nested_example.out
 
+# The input argument types to this function are temporary. When nested tensors switch to using SymInts for their metadata representation
+# this will need to be updated
+- func: _nested_view_from_buffer(Tensor(a) self, Tensor nested_size, Tensor nested_strides, int[] offsets) -> Tensor(a)
+  variants: function
+  device_check: NoCheck
+  dispatch:
+    CPU, CUDA: _nested_view_from_buffer
+
+- func: _nested_view_from_buffer_copy(Tensor self, Tensor nested_size, Tensor nested_strides, int[] offsets) -> Tensor
+  variants: function
+  device_check: NoCheck
+  tags: view_copy
+  dispatch:
+    CompositeExplicitAutogradNonFunctional: _nested_view_from_buffer_copy
+  autogen: _nested_view_from_buffer_copy.out
+
 - func: _trilinear(Tensor i1, Tensor i2, Tensor i3, int[] expand1, int[] expand2, int[] expand3, int[] sumdim, int unroll_dim=1) -> Tensor
   dispatch:
   # calls unsqueeze
@@ -5548,7 +5585,7 @@
 
 - func: zeros(SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
   dispatch:
-    CompositeExplicitAutograd: zeros
+    CompositeExplicitAutograd: zeros_symint
 
 - func: zeros.out(SymInt[] size, *, Tensor(a!) out) -> Tensor(a!)
   dispatch:
@@ -6271,6 +6308,7 @@
   dispatch:
     SparseCPU, SparseCUDA, SparseMeta: values_sparse
     SparseCsrCPU, SparseCsrCUDA: values_sparse_csr
+    NestedTensorCPU, NestedTensorCUDA: values_nested
   device_check: NoCheck
   device_guard: False
@@ -6319,11 +6357,12 @@
     SparseCPU, SparseCUDA: copy_sparse_
   autogen: copy_sparse_to_sparse, copy_sparse_to_sparse.out
 
+# By adding AutogradNestedTensor, this makes this function CompositeImplicit-like for nested tensors
 - func: unbind.int(Tensor(a -> *) self, int dim=0) -> Tensor(a)[]
   variants: function, method
   dispatch:
     CompositeExplicitAutograd: unbind
-    NestedTensorCPU, NestedTensorCUDA: NestedTensor_unbind
+    CompositeImplicitAutogradNestedTensor: NestedTensor_unbind
 
 - func: unbind.Dimname(Tensor(a -> *) self, Dimname dim) -> Tensor(a)[]
   variants: function, method
@@ -6889,8 +6928,7 @@
   device_check: NoCheck
   device_guard: False
   dispatch:
-    Meta: view_meta
-    ZeroTensor, CPU, CUDA, QuantizedCPU, QuantizedCUDA, MPS: view
+    ZeroTensor, Meta, CPU, CUDA, QuantizedCPU, QuantizedCUDA, MPS: view
     MkldnnCPU: mkldnn_view
     NestedTensorCPU, NestedTensorCUDA: view_nested
@@ -8114,15 +8152,6 @@
     CUDA: _symeig_helper_cuda
   autogen: _symeig_helper.out
 
-- func: eig.e(Tensor self, bool eigenvectors=False, *, Tensor(a!) e, Tensor(b!) v) -> (Tensor(a!) eigenvalues, Tensor(b!) eigenvectors)
-  dispatch:
-    CompositeExplicitAutograd: eig_out
-
-- func: eig(Tensor self, bool eigenvectors=False) -> (Tensor eigenvalues, Tensor eigenvectors)
-  variants: method, function
-  dispatch:
-    CompositeExplicitAutograd: eig
-
 - func: svd.U(Tensor self, bool some=True, bool compute_uv=True, *, Tensor(a!) U, Tensor(b!) S, Tensor(c!) V) -> (Tensor(a!) U, Tensor(b!) S, Tensor(c!) V)
 
 - func: svd(Tensor self, bool some=True, bool compute_uv=True) -> (Tensor U, Tensor S, Tensor V)
@@ -12568,6 +12597,21 @@
 
 - func: linalg_multi_dot.out(Tensor[] tensors, *, Tensor(a!) out) -> Tensor(a!)
   python_module: linalg
 
+## Functions related to the `torch.nested` namespace
+# Note [nested namespace binding]
+# Functions in the nested python module should have their names start with a
+# "nested_" prefix and be bound to the desired Python name in
+# torch/nested/__init__.py, and the desired C++ name in torch/csrc/api/include/torch/nested.h.
+# The "nested_" names should be hidden from the user and not documented.
+
+- func: nested_to_padded_tensor(Tensor self, float padding, int[]? output_size=None) -> Tensor
+  python_module: nested
+  variants: function
+  dispatch:
+    NestedTensorCPU: NestedTensor_to_padded_tensor_generic
+    NestedTensorCUDA: NestedTensor_to_padded_tensor_cuda
+  autogen: nested_to_padded_tensor.out
+
 ## Functions that are only for testing
 # It is undocumented and should not be used outside of tests.
 - func: _test_serialization_subcmul(Tensor self, Tensor other, Scalar alpha=1) -> Tensor
@@ -12938,7 +12982,7 @@
 - func: expand_copy.out(Tensor self, SymInt[] size, *, bool implicit=False, Tensor(a!) out) -> Tensor(a!)
   variants: function
   dispatch:
-    CompositeExplicitAutograd: expand_copy_out
+    CompositeExplicitAutograd: expand_copy_out_symint
 
 - func: permute_copy.out(Tensor self, int[] dims, *, Tensor(a!) out) -> Tensor(a!)
@@ -13058,7 +13102,7 @@
 - func: view_copy.out(Tensor self, SymInt[] size, *, Tensor(a!) out) -> Tensor(a!)
   variants: function
   dispatch:
-    CompositeExplicitAutograd: view_copy_out
+    CompositeExplicitAutograd: view_copy_out_symint
 
 - func: view_copy.dtype_out(Tensor self, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!)
@@ -13078,13 +13122,6 @@
   dispatch:
     CompositeExplicitAutograd: alias_copy_out
 
-- func: to_padded_tensor(Tensor self, float padding, int[]? output_size=None) -> Tensor
-  variants: method
-  dispatch:
-    NestedTensorCPU: NestedTensor_to_padded_tensor_generic
-    NestedTensorCUDA: NestedTensor_to_padded_tensor_cuda
-  autogen: to_padded_tensor.out
-
 - func: _nested_tensor_softmax_with_shape(Tensor self, Tensor query) -> Tensor
   dispatch:
     NestedTensorCPU: NestedTensor_softmax_dropout
@@ -13137,6 +13174,11 @@
   structured: True
   variants: function
 
+- func: _flash_scaled_dot_product_attention(Tensor query, Tensor key, Tensor value, Tensor cum_seq_q, Tensor cum_seq_k, int max_q, int max_k, float dropout_p, bool causal) -> Tensor
+  variants: function
+  dispatch:
+    CUDA: flash_scaled_dot_product_attention
+
 - func: _transformer_decoder_only_layer_fwd(Tensor src, int embed_dim, int num_heads, Tensor qkv_weight, Tensor qkv_bias, Tensor proj_weight, Tensor proj_bias, bool use_gelu, bool norm_first, float eps, Tensor norm_weight_1, Tensor norm_bias_1, Tensor norm_weight_2, Tensor norm_bias_2, Tensor ffn_weight_1, Tensor ffn_bias_1, Tensor ffn_weight_2, Tensor ffn_bias_2, Tensor? mask=None, Tensor? incr_key=None, Tensor? incr_value=None) -> (Tensor, Tensor, Tensor)
   variants: function
   dispatch:
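The new concatenate entries near the top of this native_functions.yaml hunk carry no dispatch: section, so they are registered as composite functions. Their C++ bodies are not part of this diff; a plausible implementation (the name concatenate_sketch is illustrative) is a thin forwarding alias over at::cat:

    #include <ATen/ATen.h>

    // Hypothetical composite body for the concatenate alias declared above;
    // the actual implementation is not shown in this hunk.
    at::Tensor concatenate_sketch(at::TensorList tensors, int64_t dim) {
      return at::cat(tensors, dim);
    }

    int main() {
      auto a = at::ones({2, 3});
      auto b = at::zeros({2, 3});
      auto c = concatenate_sketch({a, b}, /*dim=*/0);
      TORCH_CHECK(c.size(0) == 4 && c.size(1) == 3);  // cat along dim 0
      return 0;
    }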
instead."); return create_nested_view_tensor(self, sizemat_reshaped, stridemat_reshaped, std::vector(self_ptr->get_offsets())); } + /** + * Create a buffer tensor that is a view of self + * + * This serves as the boundary between nested and non nested tensor + * view conversions + * + * @return Returns a new non nested tensor that + * aliases the same storage as self + */ +Tensor values_nested(const Tensor& self) { + TORCH_INTERNAL_ASSERT(self.is_nested(), "Can only create a buffer from Nested Tensor"); + auto* nt_self = get_nested_tensor_impl(self); + return nt_self->get_buffer(); +} + +/** + * Create a nested tensor that is a view of a buffer + * + * This serves as the boundary between non nested tensor and nested + * view conversions + * + * @return Returns a nested tensor that + * aliases the same storage as buffer + */ +Tensor _nested_view_from_buffer( + const Tensor& buffer, + const Tensor& nested_size_tensor, + const Tensor& nested_stride_tensor, + IntArrayRef offsets) { + TORCH_INTERNAL_ASSERT( + !buffer.is_nested(), + "Can only a create Nested Tensor from a normal tensor buffer"); + TORCH_INTERNAL_ASSERT(buffer.dim() == 1, "The input buffer must be flat"); + TORCH_INTERNAL_ASSERT(nested_size_tensor.dim() == 2, "Expected the nested size tensor to be two dimensional."); + uint64_t num_elements_nested_size = at::prod(nested_size_tensor, 1).sum().item(); + uint64_t buffer_storage_size = buffer.storage().nbytes()/buffer.dtype().itemsize(); + TORCH_INTERNAL_ASSERT( + buffer_storage_size == num_elements_nested_size, + "The number of elements in the buffer must equal the nested tensor size but buffer size: ", + buffer_storage_size, + " and nested tensor size: ", + num_elements_nested_size, + "."); + + TORCH_INTERNAL_ASSERT(nested_stride_tensor.dim() == 2, "Expected the nested stride tensor to be two dimensional."); + TORCH_INTERNAL_ASSERT(nested_size_tensor.size(0) == nested_stride_tensor.size(0), "Expected the first dimension of nested size and nested stride tensor to be equal."); + TORCH_INTERNAL_ASSERT(nested_stride_tensor.size(0) == (int64_t)offsets.size(), "Expected the first dimension of nested stride tensor to equal the length of offsets."); + return at::detail::make_tensor( + c10::TensorImpl::VIEW, + buffer, + nested_size_tensor, + nested_stride_tensor, + std::vector(offsets.begin(), offsets.end())); +} // See Note [Special size rule for nested tensor] Tensor reshape_nested(const Tensor& self, IntArrayRef proposed_shape) { @@ -1151,7 +1206,7 @@ Tensor reshape_nested(const Tensor& self, IntArrayRef proposed_shape) { ntensors > 0, "empty nested tensor cannot be reshaped"); // basic information after reshaping - int64_t ntensors_reshaped; + int64_t ntensors_reshaped{0}; if (proposed_shape[0] >= 0) { ntensors_reshaped = proposed_shape[0]; } @@ -1169,7 +1224,7 @@ Tensor reshape_nested(const Tensor& self, IntArrayRef proposed_shape) { // reshaping underlying tensor dimensions does not change offset // determine reshaped size and stride const Tensor& sizemat = self_ptr->get_nested_size_tensor(); - bool viewable; + bool viewable{false}; Tensor sizemat_reshaped, stridemat_reshaped; std::tie(viewable, sizemat_reshaped, stridemat_reshaped) = NestedTensor_compute_size_stride( sizes, strides, proposed_shape, sizemat.options()); diff --git a/aten/src/ATen/native/nested/NestedTensorTransformerFunctions.cpp b/aten/src/ATen/native/nested/NestedTensorTransformerFunctions.cpp index 231eca94f072..35a1c83e2360 100644 --- a/aten/src/ATen/native/nested/NestedTensorTransformerFunctions.cpp +++ 
diff --git a/aten/src/ATen/native/nested/NestedTensorTransformerFunctions.cpp b/aten/src/ATen/native/nested/NestedTensorTransformerFunctions.cpp
index 231eca94f072..35a1c83e2360 100644
--- a/aten/src/ATen/native/nested/NestedTensorTransformerFunctions.cpp
+++ b/aten/src/ATen/native/nested/NestedTensorTransformerFunctions.cpp
@@ -7,6 +7,7 @@
 #include <...>
 #include <...>
+#include <...>
 
 namespace at {
 namespace native {
@@ -243,5 +244,204 @@ Tensor NestedTensor_to_mask(const Tensor& nt, c10::optional<int64_t> mask_dim, c
   }
   return result;
 }
+std::tuple<Tensor, int64_t> cumulative_and_max_seq_len(Tensor qkv) {
+  TORCH_CHECK(
+      qkv.is_nested(),
+      "QKV must be nested for flash cumulative_seq_len calculation.")
+  auto* nt_impl = get_nested_tensor_impl(qkv);
+  const auto& sizes = nt_impl->get_nested_size_tensor();
+  auto size_tensor_stride = sizes.stride(0);
+
+  const int64_t batch_size = qkv.size(0);
+  auto cumulative_seqlen = at::zeros(
+      {batch_size + 1}, TensorOptions().device(at::kCPU).dtype(at::kInt));
+
+  auto* sizes_ptr = sizes.data_ptr<int64_t>();
+  auto* cumulative_seqlen_ptr = cumulative_seqlen.data_ptr<int32_t>();
+
+  int32_t sum = 0;
+  int64_t max_seqlen = -1;
+  cumulative_seqlen_ptr[0] = sum;
+  for (const auto i : c10::irange(batch_size)) {
+    // Calculate the cumulative sum of the sequence lengths
+    auto current_seq_len = sizes_ptr[i * size_tensor_stride];
+    sum += current_seq_len;
+    cumulative_seqlen_ptr[i + 1] = sum;
+
+    // Find the max element while we traverse
+    max_seqlen = std::max(max_seqlen, current_seq_len);
+  }
+  // Send to GPU, this is a pretty lightweight calc for normal batch sizes
+  // but maybe this needs to be on gpu
+  cumulative_seqlen = cumulative_seqlen.to(TensorOptions().device(at::kCUDA));
+  return std::tuple<Tensor, int64_t>{cumulative_seqlen, max_seqlen};
+}
+
+Tensor flash_attention_helper(
+    const Tensor& query,
+    const Tensor& key,
+    const Tensor& value,
+    double dropout_p,
+    bool causal) {
+  // Query is of size (batch_size x ragged_seq_len x (3 or 1) x n_heads x
+  // head_dim)
+  int64_t head_dim{query.size(-1)};
+  int64_t num_heads{query.size(-2)};
+
+  auto cumulative_and_max_q = cumulative_and_max_seq_len(query);
+  Tensor cumulative_sequence_length_q = std::get<0>(cumulative_and_max_q);
+  int64_t max_seqlen_batch_q = std::get<1>(cumulative_and_max_q);
+
+  if (key.is_same(value) || query.is_same(key) || query.is_same(value)) {
+    int64_t Nnz_q{cumulative_sequence_length_q[-1].item<int64_t>()};
+
+    // For the packed case we need to set the output size for dim 2 to 1
+    auto atten_size = get_nested_size_tensor(query).clone();
+    atten_size.index({at::indexing::Slice(), 1}) = 1;
+
+    auto qkv_buffer_reshaped =
+        get_buffer(query).view({Nnz_q, 3, num_heads, head_dim}).transpose(0, 1).contiguous();
+
+    auto i0 = qkv_buffer_reshaped[0];
+    auto i1 = qkv_buffer_reshaped[1];
+    auto i2 = qkv_buffer_reshaped[2];
+
+    TORCH_CHECK(i0.is_contiguous());
+    TORCH_CHECK(i1.is_contiguous());
+    TORCH_CHECK(i2.is_contiguous());
+
+    // If we are passing in query, key, value all the same tensors then we have
+    // packed them into one tensor and need to slice for flash attention
+    Tensor atten_buffer = at::_flash_scaled_dot_product_attention(
+        i0,
+        i1,
+        i2,
+        cumulative_sequence_length_q,
+        cumulative_sequence_length_q,
+        max_seqlen_batch_q,
+        max_seqlen_batch_q,
+        dropout_p,
+        causal);
+    // Output of flash_attention is a regular tensor; let's wrap it back up to
+    // form a nested tensor
+    return wrap_buffer(atten_buffer.view(-1), atten_size);
+  }
+
+  // Query, Key, and Value are not all the same tensor and therefore need to
+  // calculate K meta data
+
+  // The nested tensors will be of shape {Batch_size x ragged_seq_len x
+  // num_heads * head_dim }
+  auto cumulative_and_max_k = cumulative_and_max_seq_len(key);
+  Tensor cumulative_sequence_length_k = std::get<0>(cumulative_and_max_k);
+  int64_t max_seqlen_batch_k = std::get<1>(cumulative_and_max_k);
+
+  // K and V have to have the same Nnz, should probably torch_check before; for
+  // now assume, in order to not iterate over v
+  int64_t Nnz_q{cumulative_sequence_length_q[-1].item<int64_t>()};
+  int64_t Nnz_kv{cumulative_sequence_length_k[-1].item<int64_t>()};
+
+  auto query_buffer_reshaped =
+      get_buffer(query).view({Nnz_q, num_heads, head_dim});
+  auto key_buffer_reshaped =
+      get_buffer(key).view({Nnz_kv, num_heads, head_dim});
+  auto value_buffer_reshaped =
+      get_buffer(value).view({Nnz_kv, num_heads, head_dim});
+
+  Tensor atten_buffer = at::_flash_scaled_dot_product_attention(
+      query_buffer_reshaped,
+      key_buffer_reshaped,
+      value_buffer_reshaped,
+      cumulative_sequence_length_q,
+      cumulative_sequence_length_k,
+      max_seqlen_batch_q,
+      max_seqlen_batch_k,
+      dropout_p,
+      causal);
+  // Output of flash_attention is a regular tensor; let's wrap it back up to
+  // form a nested tensor, the size of which should match the query tensor
+  return wrap_buffer(atten_buffer.view(-1), get_nested_size_tensor(query));
+}
+
+Tensor flash_attention_helper_dense(
+    const Tensor& query,
+    const Tensor& key,
+    const Tensor& value,
+    double dropout_p,
+    bool causal) {
+  TORCH_INTERNAL_ASSERT(
+      !query.is_nested() && !key.is_nested() && !value.is_nested());
+  // Query is of size (batch_size x dense_seq_len x 3 x n_heads x
+  // head_dim)
+  const auto batch_size = query.size(0);
+  auto max_seqlen_batch_q = query.size(1);
+  int64_t head_dim{query.size(-1)};
+  int64_t num_heads{query.size(-2)};
+
+  auto cumulative_sequence_length_q = at::arange(
+      0,
+      (batch_size + 1) * max_seqlen_batch_q,
+      max_seqlen_batch_q,
+      TensorOptions().device(at::kCUDA).dtype(at::kInt));
+  int64_t Nnz_q{batch_size * max_seqlen_batch_q};
+
+  if (key.is_same(value) || query.is_same(key) || query.is_same(value)) {
+    // In the dense case flash attention expects an input that is
+    // (b*s) x num_heads x head_dim
+    auto query_reshaped = query.reshape({Nnz_q, 3, num_heads, head_dim});
+    // If we are passing in query, key, value all the same tensors then we have
+    // packed them into one tensor and need to slice for flash attention
+
+    Tensor atten_buffer = at::_flash_scaled_dot_product_attention(
+        query_reshaped.index({at::indexing::Slice(), 0}),
+        query_reshaped.index({at::indexing::Slice(), 1}),
+        query_reshaped.index({at::indexing::Slice(), 2}),
+        cumulative_sequence_length_q,
+        cumulative_sequence_length_q,
+        max_seqlen_batch_q,
+        max_seqlen_batch_q,
+        dropout_p,
+        causal);
+    // Reshape output to convert nnz to batch_size and seq_len
+    return atten_buffer.reshape(
+        {batch_size, max_seqlen_batch_q, num_heads, head_dim});
+  }
+
+  // Query, Key, and Value are not all the same tensor and therefore need to
+  // calculate K meta data
+  auto max_seqlen_batch_k = key.size(1);
+  auto cumulative_sequence_length_k = at::arange(
+      0,
+      (batch_size + 1) * max_seqlen_batch_k,
+      max_seqlen_batch_k,
+      TensorOptions().device(at::kCUDA).dtype(at::kInt));
+
+  // K and V have to have the same Nnz, should probably torch_check before;
+  // assume for now in order to not iterate over v
+  int64_t Nnz_kv{batch_size * max_seqlen_batch_k};
+
+  // Calculate head dim
+  TORCH_INTERNAL_ASSERT(query.size(-1) == key.size(-1));
+  TORCH_INTERNAL_ASSERT(query.size(-1) == value.size(-1));
+
+  auto query_reshaped = query.reshape({Nnz_q, num_heads, head_dim});
+  auto key_reshaped = key.reshape({Nnz_kv, num_heads, head_dim});
+  auto value_reshaped = value.reshape({Nnz_kv, num_heads, head_dim});
+
+  Tensor atten_buffer = at::_flash_scaled_dot_product_attention(
+      query_reshaped,
+      key_reshaped,
+      value_reshaped,
+      cumulative_sequence_length_q,
+      cumulative_sequence_length_k,
+      max_seqlen_batch_q,
+      max_seqlen_batch_k,
+      dropout_p,
+      causal);
+  // Reshape output to convert nnz to batch_size and seq_len
+  return atten_buffer.reshape(
+      {batch_size, max_seqlen_batch_q, num_heads, head_dim});
+}
+
 } // namespace native
 } // namespace at
diff --git a/aten/src/ATen/native/nested/NestedTensorTransformerFunctions.h b/aten/src/ATen/native/nested/NestedTensorTransformerFunctions.h
index 77eb0145d684..09b35d9c39e9 100644
--- a/aten/src/ATen/native/nested/NestedTensorTransformerFunctions.h
+++ b/aten/src/ATen/native/nested/NestedTensorTransformerFunctions.h
@@ -83,5 +83,19 @@ void add_padding_kernelLauncher(
     const std::vector<int64_t>& output_sizes,
     const int batch_size,
     const int output_batch_size);
+
+Tensor flash_attention_helper_dense(
+    const Tensor& query,
+    const Tensor& key,
+    const Tensor& value,
+    double dropout_p,
+    bool causal);
+
+Tensor flash_attention_helper(
+    const Tensor& query,
+    const Tensor& key,
+    const Tensor& value,
+    double dropout_p,
+    bool causal);
 } // namespace native
 } // namespace at
diff --git a/aten/src/ATen/native/nested/NestedTensorUtils.cpp b/aten/src/ATen/native/nested/NestedTensorUtils.cpp
index 74fd9773c366..0d560849af06 100644
--- a/aten/src/ATen/native/nested/NestedTensorUtils.cpp
+++ b/aten/src/ATen/native/nested/NestedTensorUtils.cpp
@@ -14,6 +14,13 @@ at::Tensor _nested_tensor_size(const at::Tensor& self) {
   return get_nested_size_tensor(self);
 }
 
+at::Tensor _nested_tensor_strides(const at::Tensor& self){
+  return get_nested_tensor_impl(self) -> get_nested_stride_tensor();
+}
+std::vector<int64_t> _nested_tensor_offsets(const at::Tensor& self){
+  return get_nested_tensor_impl(self) -> get_offsets();
+}
+
 // Helper functions for getting information about a nested tensor's shape.
 std::vector<int64_t> NestedTensor_get_max_size_from_size_tensor(
     const Tensor& sizes) {
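cumulative_and_max_seq_len in the NestedTensorTransformerFunctions.cpp hunk above produces, in one sweep, the exclusive prefix sums (cu_seqlens) that the flash-attention kernel expects plus the maximum sequence length. Detached from tensors, the pass is (names are illustrative):

    #include <algorithm>
    #include <cassert>
    #include <cstdint>
    #include <utility>
    #include <vector>

    // One pass over per-sample sequence lengths: cumulative[i] is the start
    // offset of sample i in the packed (Nnz) layout; cumulative.back() == Nnz.
    static std::pair<std::vector<int32_t>, int64_t> cumulativeAndMax(
        const std::vector<int64_t>& seq_lens) {
      std::vector<int32_t> cumulative(seq_lens.size() + 1, 0);
      int64_t max_seqlen = -1;
      int32_t sum = 0;
      for (size_t i = 0; i < seq_lens.size(); ++i) {
        sum += static_cast<int32_t>(seq_lens[i]);
        cumulative[i + 1] = sum;
        max_seqlen = std::max(max_seqlen, seq_lens[i]);
      }
      return {cumulative, max_seqlen};
    }

    int main() {
      const auto result = cumulativeAndMax({3, 5, 2});
      assert((result.first == std::vector<int32_t>{0, 3, 8, 10}));
      assert(result.second == 5);
      return 0;
    }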
diff --git a/aten/src/ATen/native/nested/cuda/NestedTensorTransformerFunctions.cpp b/aten/src/ATen/native/nested/cuda/NestedTensorTransformerFunctions.cpp
index 54d2b7ffd0c5..1cfeac9a5e12 100644
--- a/aten/src/ATen/native/nested/cuda/NestedTensorTransformerFunctions.cpp
+++ b/aten/src/ATen/native/nested/cuda/NestedTensorTransformerFunctions.cpp
@@ -1,4 +1,5 @@
 #include <...>
+#include <...>
 
 #include <...>
 #include <...>
@@ -9,10 +10,19 @@
 #include <...>
 #endif
 
+// TODO Consider moving all flash_attention code, nested tensor included, to
+// Transformer library
+
+#ifdef USE_FLASH_ATTENTION
+#include <...>
+#endif
+
 #include <...>
 #include <...>
 #include <...>
+#include <...>
+
 namespace at {
 namespace native {
 namespace {
@@ -207,5 +217,37 @@ Tensor NestedTensor_to_padded_tensor_cuda(
   return NestedTensor_to_padded_tensor_generic(t, padding, output_size);
 }
 
+Tensor flash_scaled_dot_product_attention(
+    const Tensor& query,
+    const Tensor& key,
+    const Tensor& value,
+    const Tensor& cumulative_sequence_length_q,
+    const Tensor& cumulative_sequence_length_k,
+    const int64_t max_seqlen_batch_q,
+    const int64_t max_seqlen_batch_k,
+    double dropout_p,
+    bool causal) {
+#if defined(USE_FLASH_ATTENTION)
+  auto softmax_scale = std::pow(query.size(-1), -0.5);
+  std::vector<Tensor> output = fmha::mha_fwd(
+      query,
+      key,
+      value,
+      cumulative_sequence_length_q,
+      cumulative_sequence_length_k,
+      max_seqlen_batch_q,
+      max_seqlen_batch_k,
+      dropout_p,
+      softmax_scale,
+      false,
+      causal,
+      false,
+      c10::nullopt);
+  return output[0];
+#endif
+  TORCH_CHECK(false, "USE_FLASH_ATTENTION was not enabled for build.")
+  return Tensor{};
+}
+
 } // namespace native
 } // namespace at
diff --git a/aten/src/ATen/native/quantized/AffineQuantizerBase.cpp b/aten/src/ATen/native/quantized/AffineQuantizerBase.cpp
index e40f8ef1fdb0..5d02d9e04ed7 100644
--- a/aten/src/ATen/native/quantized/AffineQuantizerBase.cpp
+++ b/aten/src/ATen/native/quantized/AffineQuantizerBase.cpp
@@ -71,6 +71,33 @@ void quantize_vec(
       (float)scale, (int32_t)zero_point, precision});
 }
 
+#if defined(__ARM_NEON__) || defined(__aarch64__)
+// For use when compiling FBGEMM on aarch64 but still supporting x86
+// intrinsics via simde
+template <typename T>
+T quantize_val_arm(
+    const float scale,
+    const int32_t zero_point,
+    const float value) {
+  constexpr int32_t qmin = std::numeric_limits<T>::min();
+  constexpr int32_t qmax = std::numeric_limits<T>::max();
+  float inv_scale = 1.0f / scale;
+  auto r = zero_point + static_cast<int32_t>(std::nearbyint(value * inv_scale));
+  r = std::max(r, qmin);
+  r = std::min(r, qmax);
+  return static_cast<T>(r);
+}
+
+template uint8_t quantize_val_arm<uint8_t>(
+    const float scale,
+    const int32_t zero_point,
+    const float value);
+template int8_t quantize_val_arm<int8_t>(
+    const float scale,
+    const int32_t zero_point,
+    const float value);
+#endif
+
 template <typename T>
 inline float dequantize_val(double scale, int64_t zero_point, T value) {
   // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
diff --git a/aten/src/ATen/native/quantized/cpu/OnednnUtils.h b/aten/src/ATen/native/quantized/cpu/OnednnUtils.h
index 6ad70356b3e0..4ae88871e9f5 100644
--- a/aten/src/ATen/native/quantized/cpu/OnednnUtils.h
+++ b/aten/src/ATen/native/quantized/cpu/OnednnUtils.h
@@ -4,8 +4,7 @@
 #if AT_MKLDNN_ENABLED()
 #include <...>
 #include <...>
-#include <...>
-#include <...>
+#include <...>
 
 struct PackedLinearWeightsOnednn : public LinearPackedParamsBase {
   PackedLinearWeightsOnednn(
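quantize_val_arm implements the usual affine quantization rule, q = clamp(zero_point + round(value / scale), qmin, qmax), with round-half-to-even via nearbyint. A worked standalone example for uint8_t (written with a direct division where the original precomputes 1/scale; otherwise the same math):

    #include <algorithm>
    #include <cassert>
    #include <cmath>
    #include <cstdint>
    #include <limits>

    template <typename T>
    T quantize_val_sketch(float scale, int32_t zero_point, float value) {
      constexpr int32_t qmin = std::numeric_limits<T>::min();
      constexpr int32_t qmax = std::numeric_limits<T>::max();
      int32_t r = zero_point + static_cast<int32_t>(std::nearbyint(value / scale));
      r = std::max(r, qmin);  // clamp into the representable range
      r = std::min(r, qmax);
      return static_cast<T>(r);
    }

    int main() {
      // scale 0.1, zero_point 128: 1.0f -> 128 + round(10.0) = 138.
      assert(quantize_val_sketch<uint8_t>(0.1f, 128, 1.0f) == 138);
      // Far out-of-range values clamp to qmax = 255.
      assert(quantize_val_sketch<uint8_t>(0.1f, 128, 1000.0f) == 255);
      return 0;
    }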
diff --git a/aten/src/ATen/native/quantized/cpu/qnnpack/cmake/DownloadGoogleTest.cmake b/aten/src/ATen/native/quantized/cpu/qnnpack/cmake/DownloadGoogleTest.cmake
index 4a86d641e412..66b2232b5925 100644
--- a/aten/src/ATen/native/quantized/cpu/qnnpack/cmake/DownloadGoogleTest.cmake
+++ b/aten/src/ATen/native/quantized/cpu/qnnpack/cmake/DownloadGoogleTest.cmake
@@ -11,7 +11,7 @@ project(googletest-download NONE)
 include(ExternalProject)
 ExternalProject_Add(googletest
   URL https://github.com/google/googletest/archive/release-1.10.0.zip
-  URL_HASH SHA256=f3ed3b58511efd272eb074a3a6d6fb79d7c2e6a0e374323d1e6bcbcc1ef141bf
+  URL_HASH SHA256=94c634d499558a76fa649edb13721dce6e98fb1e7018dfaeba3cd7a083945e91
   SOURCE_DIR "${CONFU_DEPENDENCIES_SOURCE_DIR}/googletest"
   BINARY_DIR "${CONFU_DEPENDENCIES_BINARY_DIR}/googletest"
   CONFIGURE_COMMAND ""
diff --git a/aten/src/ATen/native/quantized/cpu/qnnpack/deps/clog/CMakeLists.txt b/aten/src/ATen/native/quantized/cpu/qnnpack/deps/clog/CMakeLists.txt
index f19d6c61f33f..e763e4e3ba93 100644
--- a/aten/src/ATen/native/quantized/cpu/qnnpack/deps/clog/CMakeLists.txt
+++ b/aten/src/ATen/native/quantized/cpu/qnnpack/deps/clog/CMakeLists.txt
@@ -63,7 +63,7 @@ set_target_properties(clog PROPERTIES C_EXTENSIONS NO)
 CLOG_TARGET_RUNTIME_LIBRARY(clog)
 set_target_properties(clog PROPERTIES PUBLIC_HEADER include/clog.h)
-target_include_directories(clog BEFORE PUBLIC include)
+target_include_directories(clog PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include> $<INSTALL_INTERFACE:include>)
 if(CLOG_LOG_TO_STDIO)
   target_compile_definitions(clog PRIVATE CLOG_LOG_TO_STDIO=1)
 else()
@@ -73,7 +73,10 @@ if(ANDROID AND NOT CLOG_LOG_TO_STDIO)
   target_link_libraries(clog PRIVATE log)
 endif()
 
+add_library(cpuinfo::clog ALIAS clog)
+
 install(TARGETS clog
+  EXPORT cpuinfo-targets
   LIBRARY DESTINATION "${CMAKE_INSTALL_LIBDIR}"
   ARCHIVE DESTINATION "${CMAKE_INSTALL_LIBDIR}"
   PUBLIC_HEADER DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}")
diff --git a/aten/src/ATen/native/quantized/cpu/qnnpack/deps/clog/cmake/DownloadGoogleTest.cmake b/aten/src/ATen/native/quantized/cpu/qnnpack/deps/clog/cmake/DownloadGoogleTest.cmake
index 4a86d641e412..66b2232b5925 100644
--- a/aten/src/ATen/native/quantized/cpu/qnnpack/deps/clog/cmake/DownloadGoogleTest.cmake
+++ b/aten/src/ATen/native/quantized/cpu/qnnpack/deps/clog/cmake/DownloadGoogleTest.cmake
@@ -11,7 +11,7 @@ project(googletest-download NONE)
 include(ExternalProject)
 ExternalProject_Add(googletest
   URL https://github.com/google/googletest/archive/release-1.10.0.zip
-  URL_HASH SHA256=f3ed3b58511efd272eb074a3a6d6fb79d7c2e6a0e374323d1e6bcbcc1ef141bf
+  URL_HASH SHA256=94c634d499558a76fa649edb13721dce6e98fb1e7018dfaeba3cd7a083945e91
   SOURCE_DIR "${CONFU_DEPENDENCIES_SOURCE_DIR}/googletest"
   BINARY_DIR "${CONFU_DEPENDENCIES_BINARY_DIR}/googletest"
   CONFIGURE_COMMAND ""
diff --git a/aten/src/ATen/native/sparse/Macros.h b/aten/src/ATen/native/sparse/Macros.h
new file mode 100644
index 000000000000..10174e9ad655
--- /dev/null
+++ b/aten/src/ATen/native/sparse/Macros.h
@@ -0,0 +1,16 @@
+#pragma once
+
+#if defined(__CUDACC__) || defined(__HIPCC__)
+#define GPUCC
+#define FUNCAPI __host__ __device__
+#define INLINE __forceinline__
+#else
+#define FUNCAPI
+#define INLINE inline
+#endif
+
+#if defined(_WIN32) || defined(_WIN64)
+#define RESTRICT __restrict
+#else
+#define RESTRICT __restrict__
+#endif
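The point of the new Macros.h is that sparse intersection kernels can share one source file between host and device builds: FUNCAPI expands to __host__ __device__ only under nvcc/hipcc, and RESTRICT papers over the MSVC spelling of __restrict__. A small usage sketch, redefining the macros locally so it compiles standalone outside the ATen tree:

    #include <cstddef>
    #include <cstdint>

    // Local re-definitions mirroring the new Macros.h.
    #if defined(__CUDACC__) || defined(__HIPCC__)
    #define FUNCAPI __host__ __device__
    #define INLINE __forceinline__
    #else
    #define FUNCAPI
    #define INLINE inline
    #endif
    #if defined(_WIN32) || defined(_WIN64)
    #define RESTRICT __restrict
    #else
    #define RESTRICT __restrict__
    #endif

    // Compiles as a device-capable function under nvcc and as plain inline
    // C++ elsewhere; restrict lets the compiler assume the arrays don't alias.
    FUNCAPI INLINE int64_t dot(const int64_t* RESTRICT a,
                               const int64_t* RESTRICT b,
                               size_t n) {
      int64_t acc = 0;
      for (size_t i = 0; i < n; ++i) {
        acc += a[i] * b[i];
      }
      return acc;
    }

    int main() {
      const int64_t a[3] = {1, 2, 3};
      const int64_t b[3] = {4, 5, 6};
      return dot(a, b, 3) == 32 ? 0 : 1;  // 4 + 10 + 18
    }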
diff --git a/aten/src/ATen/native/sparse/SparseBinaryOpIntersectionCommon.h b/aten/src/ATen/native/sparse/SparseBinaryOpIntersectionCommon.h
new file mode 100644
index 000000000000..6e2a50a3c1f4
--- /dev/null
+++ b/aten/src/ATen/native/sparse/SparseBinaryOpIntersectionCommon.h
@@ -0,0 +1,532 @@
+#pragma once
+
+#include <...>
+#include <...>
+#include <...>
+#include <...>
+#include <...>
+#include <...>
+
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <...>
+#include <...>
+#else
+#include <...>
+#include <...>
+#include <...>
+#include <...>
+#include <...>
+#endif
+
+#ifdef GPUCC
+#define NAME "sparse_binary_op_intersection_cuda"
+#else
+#define NAME "sparse_binary_op_intersection_cpu"
+#endif
+
+#define CALL(...) __VA_ARGS__();
+#define EXPAND(b, n, ...)         \
+  if (b) {                        \
+    using index_t ## n = int32_t; \
+    __VA_ARGS__                   \
+  }                               \
+  else {                          \
+    using index_t ## n = int64_t; \
+    __VA_ARGS__                   \
+  }
+#define BOOL_TO_INDEX_TYPE1(b0, ...) \
+  EXPAND(b0, 0, CALL(__VA_ARGS__))
+#define BOOL_TO_INDEX_TYPE2(b1, b0, ...) \
+  EXPAND(b1, 1, BOOL_TO_INDEX_TYPE1(b0, __VA_ARGS__))
+#define BOOL_TO_INDEX_TYPE3(b2, b1, b0, ...) \
+  EXPAND(b2, 2, BOOL_TO_INDEX_TYPE2(b1, b0, __VA_ARGS__))
+
+namespace at {
+namespace native {
+
+namespace {
+
+using at::sparse::get_sparse_impl;
+
+// ForwardIt: only legacy random access iterator is supported.
+template <class ForwardIt, class T, bool is_lower = true>
+static FUNCAPI INLINE
+ForwardIt find_bound(ForwardIt first, ForwardIt last, const T& value) {
+    ForwardIt RESTRICT it;
+    typename std::iterator_traits<ForwardIt>::difference_type count, step;
+    // NOTE: std::distance(first, last) compiles but produces wrong results on CUDA,
+    // so only legacy random access iterators are safe in this code.
+    count = last - first;
+
+    while (count > 0) {
+      it = first;
+      step = count / 2;
+      // avoiding std::advance(it, step),
+      // although it does work unlike std::distance on CUDA.
+      it += step;
+      // The decision which separates finding a lower bound vs an upper bound.
+      // Note that a lower bound is a value at *it with the smallest index
+      // such that *it >= value if such value exists, or last if does not.
+      // Similarly, an upper bound is a value at *it with the smallest index
+      // such that *it > value if such value exists, or last if does not.
+      // Let is_lower = true and *it < value, then we know that *it and values
+      // preceding *it cannot contain a lower bound, so we adjust initial iterator range
+      // from [first, first + count] to [first + step + 1, first + count - (step + 1)],
+      // where +1 skips the element at which we have just evaluated *it < value.
+      // Similar logic holds when is_lower = false.
+      if (is_lower ? *it < value : value >= *it) {
+        first = ++it;
+        count -= step + 1;
+      }
+      else {
+        count = step;
+      }
+    }
+    return first;
+}
+
+template
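The contract documented for find_bound is exactly that of std::lower_bound / std::upper_bound, selected at compile time, with the iterator arithmetic kept to raw += so it stays correct on CUDA. A host-side check of the same branching condition against the standard library (find_bound_sketch is a simplified stand-in):

    #include <algorithm>
    #include <cassert>
    #include <vector>

    // is_lower = true  -> first element >= value (std::lower_bound)
    // is_lower = false -> first element >  value (std::upper_bound)
    template <bool is_lower, class T>
    const T* find_bound_sketch(const T* first, const T* last, const T& value) {
      auto count = last - first;
      while (count > 0) {
        const auto step = count / 2;
        const T* it = first + step;
        if (is_lower ? *it < value : !(value < *it)) {
          first = it + 1;  // discard *it and everything before it
          count -= step + 1;
        } else {
          count = step;
        }
      }
      return first;
    }

    int main() {
      std::vector<int> v = {1, 2, 2, 2, 5, 7};
      const int* lo = find_bound_sketch<true>(v.data(), v.data() + v.size(), 2);
      const int* hi = find_bound_sketch<false>(v.data(), v.data() + v.size(), 2);
      assert(lo == &*std::lower_bound(v.begin(), v.end(), 2));
      assert(hi == &*std::upper_bound(v.begin(), v.end(), 2));
      return 0;
    }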